diff options
Diffstat (limited to 'net')
113 files changed, 6251 insertions, 1192 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 2a78da4072de..990b9fde28c6 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -647,8 +647,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; struct phy_device *phydev = vlan->real_dev->phydev; - if (phydev && phydev->drv && phydev->drv->ts_info) { - return phydev->drv->ts_info(phydev, info); + if (phy_has_tsinfo(phydev)) { + return phy_ts_info(phydev, info); } else if (ops->get_ts_info) { return ops->get_ts_info(vlan->real_dev, info); } else { diff --git a/net/Kconfig b/net/Kconfig index bd191f978a23..54916b7adb9b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -108,9 +108,10 @@ config NETWORK_PHY_TIMESTAMPING bool "Timestamping in PHY devices" select NET_PTP_CLASSIFY help - This allows timestamping of network packets by PHYs with - hardware timestamping capabilities. This option adds some - overhead in the transmit and receive paths. + This allows timestamping of network packets by PHYs (or + other MII bus snooping devices) with hardware timestamping + capabilities. This option adds some overhead in the transmit + and receive paths. If you are unsure how to answer this question, answer N. @@ -448,6 +449,14 @@ config FAILOVER migration of VMs with direct attached VFs by failing over to the paravirtual datapath when the VF is unplugged. +config ETHTOOL_NETLINK + bool "Netlink interface for ethtool" + default y + help + An alternative userspace interface for ethtool based on generic + netlink. It provides better extensibility and some new features, + e.g. notification messages. + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/Makefile b/net/Makefile index 449fc0b221f8..848303d98d3d 100644 --- a/net/Makefile +++ b/net/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ -obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ diff --git a/net/atm/lec.c b/net/atm/lec.c index 5a77c235a212..b57368e70aab 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -194,7 +194,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) dev->stats.tx_bytes += skb->len; } -static void lec_tx_timeout(struct net_device *dev) +static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) { pr_info("%s\n", dev->name); netif_trans_update(dev); diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index 1d4d7d415730..cc1cff63194f 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -112,7 +112,7 @@ static int bnep_net_set_mac_addr(struct net_device *dev, void *arg) return 0; } -static void bnep_net_timeout(struct net_device *dev) +static void bnep_net_timeout(struct net_device *dev, unsigned int txqueue) { BT_DBG("net_timeout"); netif_wake_queue(dev); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f79205d4444f..d555c0d8657d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,7 +15,7 @@ #include <trace/events/bpf_test_run.h> static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, - u32 *retval, u32 *time) + u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; enum bpf_cgroup_storage_type stype; @@ -41,7 +41,11 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { bpf_cgroup_storage_set(storage); - *retval = BPF_PROG_RUN(prog, ctx); + + if (xdp) + *retval = bpf_prog_run_xdp(prog, ctx); + else + *retval = BPF_PROG_RUN(prog, ctx); if (signal_pending(current)) { ret = -EINTR; @@ -247,34 +251,53 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return 0; /* make sure the fields we don't use are zeroed */ - if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) + return -EINVAL; + + /* mark is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), + offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + - sizeof_field(struct __sk_buff, priority), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + - sizeof_field(struct __sk_buff, cb), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ + /* wire_len is allowed */ + /* gso_segs is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + - sizeof_field(struct __sk_buff, tstamp), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), sizeof(struct __sk_buff))) return -EINVAL; + skb->mark = __skb->mark; skb->priority = __skb->priority; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + if (__skb->wire_len == 0) { + cb->pkt_len = skb->len; + } else { + if (__skb->wire_len < skb->len || + __skb->wire_len > GSO_MAX_SIZE) + return -EINVAL; + cb->pkt_len = __skb->wire_len; + } + + if (__skb->gso_segs > GSO_MAX_SEGS) + return -EINVAL; + skb_shinfo(skb)->gso_segs = __skb->gso_segs; + return 0; } @@ -285,9 +308,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!__skb) return; + __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); + __skb->wire_len = cb->pkt_len; + __skb->gso_segs = skb_shinfo(skb)->gso_segs; } int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, @@ -359,7 +385,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; - ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { @@ -416,8 +442,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + bpf_prog_change_xdp(NULL, prog); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || @@ -425,6 +451,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: + bpf_prog_change_xdp(prog, NULL); kfree(data); return ret; } @@ -437,8 +464,7 @@ static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) /* flags is allowed */ - if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) + - sizeof_field(struct bpf_flow_keys, flags), + if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index a0a54482aabc..60136575aea4 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1607,6 +1607,19 @@ static int br_fill_linkxstats(struct sk_buff *skb, br_multicast_get_stats(br, p, nla_data(nla)); } #endif + + if (p) { + nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP, + sizeof(p->stp_xstats), + BRIDGE_XSTATS_PAD); + if (!nla) + goto nla_put_failure; + + spin_lock_bh(&br->lock); + memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats)); + spin_unlock_bh(&br->lock); + } + nla_nest_end(skb, nest); *prividx = 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 36b0367ca1e0..f540f3bdf294 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -283,6 +283,8 @@ struct net_bridge_port { #endif u16 group_fwd_mask; u16 backup_redirected_cnt; + + struct bridge_stp_xstats stp_xstats; }; #define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f1410f8d312..6856a6d9282b 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -45,6 +45,17 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) br_info(p->br, "port %u(%s) entered %s state\n", (unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]); + + if (p->br->stp_enabled == BR_KERNEL_STP) { + switch (p->state) { + case BR_STATE_BLOCKING: + p->stp_xstats.transition_blk++; + break; + case BR_STATE_FORWARDING: + p->stp_xstats.transition_fwd++; + break; + } + } } /* called under bridge lock */ @@ -484,6 +495,8 @@ void br_received_config_bpdu(struct net_bridge_port *p, struct net_bridge *br; int was_root; + p->stp_xstats.rx_bpdu++; + br = p->br; was_root = br_is_root_bridge(br); @@ -517,6 +530,8 @@ void br_received_config_bpdu(struct net_bridge_port *p, /* called under bridge lock */ void br_received_tcn_bpdu(struct net_bridge_port *p) { + p->stp_xstats.rx_tcn++; + if (br_is_designated_port(p)) { br_info(p->br, "port %u(%s) received tcn bpdu\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 7796dd9d42d7..0e4572f31330 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -118,6 +118,8 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) br_set_ticks(buf+33, bpdu->forward_delay); br_send_bpdu(p, buf, 35); + + p->stp_xstats.tx_bpdu++; } /* called under bridge lock */ @@ -133,6 +135,8 @@ void br_send_tcn_bpdu(struct net_bridge_port *p) buf[2] = 0; buf[3] = BPDU_TYPE_TCN; br_send_bpdu(p, buf, 4); + + p->stp_xstats.tx_tcn++; } /* diff --git a/net/core/Makefile b/net/core/Makefile index a104dc8faafc..3e2c378e5f31 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -8,7 +8,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ +obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ fib_notifier.o xdp.o flow_offload.o diff --git a/net/core/dev.c b/net/core/dev.c index 0ad39c87b7fd..d99f88c58636 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4932,7 +4932,6 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { -#ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { int ingress_retval; @@ -4946,7 +4945,6 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, rcu_read_unlock(); return ingress_retval; } -#endif /* CONFIG_NETFILTER_INGRESS */ return 0; } @@ -8542,7 +8540,17 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { + bool non_hw = !(flags & XDP_FLAGS_HW_MODE); + struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; + int err; + + if (non_hw) { + prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, + XDP_QUERY_PROG)); + if (IS_ERR(prev_prog)) + prev_prog = NULL; + } memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -8553,7 +8561,14 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, xdp.flags = flags; xdp.prog = prog; - return bpf_op(dev, &xdp); + err = bpf_op(dev, &xdp); + if (!err && non_hw) + bpf_prog_change_xdp(prev_prog, prog); + + if (prev_prog) + bpf_prog_put(prev_prog); + + return err; } static void dev_xdp_uninstall(struct net_device *dev) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 5163d900bb4f..dbaebbe573f0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -187,6 +187,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; } diff --git a/net/core/devlink.c b/net/core/devlink.c index 4c63c9a4c09e..d30aa47052aa 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4844,21 +4844,12 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { - if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && - state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) - return; - - if (reporter->health_state == state) - return; - - reporter->health_state = state; - trace_devlink_health_reporter_state_update(reporter->devlink, - reporter->ops->name, state); + reporter->recovery_count++; + reporter->last_recovery_ts = jiffies; } -EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); +EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, @@ -4876,9 +4867,8 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter, if (err) return err; - reporter->recovery_count++; + devlink_health_reporter_recovery_done(reporter); reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - reporter->last_recovery_ts = jiffies; return 0; } @@ -5090,6 +5080,48 @@ genlmsg_cancel: return -EMSGSIZE; } +static void devlink_recover_notify(struct devlink_health_reporter *reporter, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_health_reporter_fill(msg, reporter->devlink, + reporter, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(reporter->devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void +devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state state) +{ + if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && + state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) + return; + + if (reporter->health_state == state) + return; + + reporter->health_state = state; + trace_devlink_health_reporter_state_update(reporter->devlink, + reporter->ops->name, state); + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); + static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, struct genl_info *info) { diff --git a/net/core/filter.c b/net/core/filter.c index 28b3c258188c..42fd17c48c5f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3511,36 +3511,16 @@ err: } static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) + struct bpf_map *map, struct xdp_buff *xdp) { - int err; - switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: { - struct bpf_dtab_netdev *dst = fwd; - - err = dev_map_enqueue(dst, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_CPUMAP: { - struct bpf_cpu_map_entry *rcpu = fwd; - - err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_XSKMAP: { - struct xdp_sock *xs = fwd; - - err = __xsk_map_redirect(map, xdp, xs); - return err; - } + case BPF_MAP_TYPE_DEVMAP_HASH: + return dev_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_CPUMAP: + return cpu_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_redirect(fwd, xdp); default: break; } @@ -3549,26 +3529,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = ri->map_to_flush; - - ri->map_to_flush = NULL; - if (map) { - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - __dev_map_flush(map); - break; - case BPF_MAP_TYPE_CPUMAP: - __cpu_map_flush(map); - break; - case BPF_MAP_TYPE_XSKMAP: - __xsk_map_flush(map); - break; - default: - break; - } - } + __dev_map_flush(); + __cpu_map_flush(); + __xsk_map_flush(); } EXPORT_SYMBOL_GPL(xdp_do_flush_map); @@ -3617,14 +3580,10 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) - xdp_do_flush_map(); - - err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); if (unlikely(err)) goto err; - ri->map_to_flush = map; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: @@ -8941,3 +8900,11 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ + +DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) + +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) +{ + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), + prev_prog, prog); +} diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a6aefe989043..9b7cbe35df37 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -96,10 +96,65 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); +static void __page_pool_return_page(struct page_pool *pool, struct page *page); + +noinline +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, + bool refill) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + int pref_nid; /* preferred NUMA node */ + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Softirq guarantee CPU and thus NUMA node is stable. This, + * assumes CPU refilling driver RX-ring will also run RX-NAPI. + */ +#ifdef CONFIG_NUMA + pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; +#else + /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ + pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ +#endif + + /* Slower-path: Get pages from locked ring queue */ + spin_lock(&r->consumer_lock); + + /* Refill alloc array, but only if NUMA match */ + do { + page = __ptr_ring_consume(r); + if (unlikely(!page)) + break; + + if (likely(page_to_nid(page) == pref_nid)) { + pool->alloc.cache[pool->alloc.count++] = page; + } else { + /* NUMA mismatch; + * (1) release 1 page to page-allocator and + * (2) break out to fallthrough to alloc_pages_node. + * This limit stress on page buddy alloactor. + */ + __page_pool_return_page(pool, page); + page = NULL; + break; + } + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL && + refill); + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + + spin_unlock(&r->consumer_lock); + return page; +} + /* fast path */ static struct page *__page_pool_get_cached(struct page_pool *pool) { - struct ptr_ring *r = &pool->ring; bool refill = false; struct page *page; @@ -113,20 +168,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) refill = true; } - /* Quicker fallback, avoid locks when ring is empty */ - if (__ptr_ring_empty(r)) - return NULL; - - /* Slow-path: Get page from locked ring queue, - * refill alloc array if requested. - */ - spin_lock(&r->consumer_lock); - page = __ptr_ring_consume(r); - if (refill) - pool->alloc.count = __ptr_ring_consume_batched(r, - pool->alloc.cache, - PP_ALLOC_CACHE_REFILL); - spin_unlock(&r->consumer_lock); + page = page_pool_refill_alloc_cache(pool, refill); return page; } @@ -163,7 +205,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, */ /* Cache was empty, do real allocation */ +#ifdef CONFIG_NUMA page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); +#else + page = alloc_pages(gfp, pool->p.order); +#endif if (!page) return NULL; @@ -311,13 +357,10 @@ static bool __page_pool_recycle_direct(struct page *page, /* page is NOT reusable when: * 1) allocated when system is under some pressure. (page_is_pfmemalloc) - * 2) belongs to a different NUMA node than pool->p.nid. - * - * To update pool->p.nid users must call page_pool_update_nid. */ static bool pool_page_reusable(struct page_pool *pool, struct page *page) { - return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; + return !page_is_pfmemalloc(page); } void __page_pool_put_page(struct page_pool *pool, struct page *page, @@ -484,7 +527,15 @@ EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { + struct page *page; + trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; + + /* Flush pool alloc cache, as refill will check NUMA node */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 02916f43bf63..20bc406f3871 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1041,6 +1041,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + 0; } @@ -1757,6 +1758,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; + if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && + nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) + goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) @@ -1822,6 +1826,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, + [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 973a71f4bc89..44b0894d8ae1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5472,12 +5472,15 @@ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, } /** - * skb_mpls_push() - push a new MPLS header after the mac header + * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of + * the packet * * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is + * ethernet * * Expects skb->data at mac header. * @@ -5501,7 +5504,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, return err; if (!skb->inner_protocol) { - skb_set_inner_network_header(skb, mac_len); + skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); } @@ -5510,6 +5513,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, mac_len); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); + skb_reset_mac_len(skb); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; @@ -5529,7 +5533,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_push); * @skb: buffer * @next_proto: ethertype of header after popped MPLS header * @mac_len: length of the MAC header - * @ethernet: flag to indicate if ethernet header is present in packet + * @ethernet: flag to indicate if the packet is ethernet * * Expects skb->data at mac header. * diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 7911235706a9..04840697fe79 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -13,7 +13,7 @@ static unsigned int classify(const struct sk_buff *skb) { if (likely(skb->dev && skb->dev->phydev && - skb->dev->phydev->drv)) + skb->dev->phydev->mii_ts)) return ptp_classify_raw(skb); else return PTP_CLASS_NONE; @@ -21,7 +21,7 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { - struct phy_device *phydev; + struct mii_timestamper *mii_ts; struct sk_buff *clone; unsigned int type; @@ -32,22 +32,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return; - phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) return; - phydev->drv->txtstamp(phydev, clone, type); + mii_ts->txtstamp(mii_ts, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { - struct phy_device *phydev; + struct mii_timestamper *mii_ts; unsigned int type; - if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv) + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) return false; if (skb_headroom(skb) < ETH_HLEN) @@ -62,9 +62,9 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->rxtstamp)) + return mii_ts->rxtstamp(mii_ts, skb, type); return false; } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 1e6c3cac11e6..92663dcb3aa2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -29,6 +29,12 @@ config NET_DSA_TAG_8021Q Drivers which use these helpers should select this as dependency. +config NET_DSA_TAG_AR9331 + tristate "Tag driver for Atheros AR9331 SoC with built-in switch" + help + Say Y or M if you want to enable support for tagging frames for + the Atheros AR9331 SoC with built-in switch. + config NET_DSA_TAG_BRCM_COMMON tristate default n diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 9a482c38bdb1..108486cfdeef 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -5,6 +5,7 @@ dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o # tagging formats obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o +obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c66abbed4daf..c6d81f2baf4e 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -614,6 +614,32 @@ static int dsa_port_parse_dsa(struct dsa_port *dp) return 0; } +static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, + struct net_device *master) +{ + enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE; + struct dsa_switch *mds, *ds = dp->ds; + unsigned int mdp_upstream; + struct dsa_port *mdp; + + /* It is possible to stack DSA switches onto one another when that + * happens the switch driver may want to know if its tagging protocol + * is going to work in such a configuration. + */ + if (dsa_slave_dev_check(master)) { + mdp = dsa_slave_to_port(master); + mds = mdp->ds; + mdp_upstream = dsa_upstream_port(mds, mdp->index); + tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream, + DSA_TAG_PROTO_NONE); + } + + /* If the master device is not itself a DSA slave in a disjoint DSA + * tree, then return immediately. + */ + return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); +} + static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { struct dsa_switch *ds = dp->ds; @@ -621,20 +647,21 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); + tag_protocol = dsa_get_tag_protocol(dp, master); tag_ops = dsa_tag_driver_get(tag_protocol); if (IS_ERR(tag_ops)) { if (PTR_ERR(tag_ops) == -ENOPROTOOPT) return -EPROBE_DEFER; dev_warn(ds->dev, "No tagger for this switch\n"); + dp->master = NULL; return PTR_ERR(tag_ops); } + dp->master = master; dp->type = DSA_PORT_TYPE_CPU; dp->filter = tag_ops->filter; dp->rcv = tag_ops->rcv; dp->tag_ops = tag_ops; - dp->master = master; dp->dst = dst; return 0; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 2dd86d9bcda9..a7662e7a691d 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -150,22 +150,6 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); -void dsa_port_phylink_validate(struct phylink_config *config, - unsigned long *supported, - struct phylink_link_state *state); -void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state); -void dsa_port_phylink_mac_config(struct phylink_config *config, - unsigned int mode, - const struct phylink_link_state *state); -void dsa_port_phylink_mac_an_restart(struct phylink_config *config); -void dsa_port_phylink_mac_link_down(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface); -void dsa_port_phylink_mac_link_up(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; /* slave.c */ @@ -173,13 +157,12 @@ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); +bool dsa_slave_dev_check(const struct net_device *dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev); - static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); diff --git a/net/dsa/master.c b/net/dsa/master.c index 3255dfc97f86..bd44bde272f4 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -197,6 +197,35 @@ static int dsa_master_get_phys_port_name(struct net_device *dev, return 0; } +static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch *ds = cpu_dp->ds; + struct dsa_switch_tree *dst; + int err = -EOPNOTSUPP; + struct dsa_port *dp; + + dst = ds->dst; + + switch (cmd) { + case SIOCGHWTSTAMP: + case SIOCSHWTSTAMP: + /* Deny PTP operations on master if there is at least one + * switch in the tree that is PTP capable. + */ + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->ops->port_hwtstamp_get || + dp->ds->ops->port_hwtstamp_set) + return -EBUSY; + break; + } + + if (cpu_dp->orig_ndo_ops && cpu_dp->orig_ndo_ops->ndo_do_ioctl) + err = cpu_dp->orig_ndo_ops->ndo_do_ioctl(dev, ifr, cmd); + + return err; +} + static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -249,6 +278,7 @@ static int dsa_master_ndo_setup(struct net_device *dev) memcpy(ops, cpu_dp->orig_ndo_ops, sizeof(*ops)); ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; + ops->ndo_do_ioctl = dsa_master_ioctl; dev->netdev_ops = ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 46ac9ba21987..774facb8d547 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -415,9 +415,9 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return phydev; } -void dsa_port_phylink_validate(struct phylink_config *config, - unsigned long *supported, - struct phylink_link_state *state) +static void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -427,10 +427,9 @@ void dsa_port_phylink_validate(struct phylink_config *config, ds->ops->phylink_validate(ds, dp->index, supported, state); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); -void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state) +static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -444,11 +443,10 @@ void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0) state->link = 0; } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_pcs_get_state); -void dsa_port_phylink_mac_config(struct phylink_config *config, - unsigned int mode, - const struct phylink_link_state *state) +static void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -458,9 +456,8 @@ void dsa_port_phylink_mac_config(struct phylink_config *config, ds->ops->phylink_mac_config(ds, dp->index, mode, state); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config); -void dsa_port_phylink_mac_an_restart(struct phylink_config *config) +static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -470,11 +467,10 @@ void dsa_port_phylink_mac_an_restart(struct phylink_config *config) ds->ops->phylink_mac_an_restart(ds, dp->index); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart); -void dsa_port_phylink_mac_link_down(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface) +static void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct phy_device *phydev = NULL; @@ -491,12 +487,11 @@ void dsa_port_phylink_mac_link_down(struct phylink_config *config, ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down); -void dsa_port_phylink_mac_link_up(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev) +static void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -509,7 +504,6 @@ void dsa_port_phylink_mac_link_up(struct phylink_config *config, ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, @@ -605,6 +599,7 @@ static int dsa_port_phylink_register(struct dsa_port *dp) dp->pl_config.dev = ds->dev; dp->pl_config.type = PHYLINK_DEV; + dp->pl_config.pcs_poll = ds->pcs_poll; dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 78ffc87dc25e..088c886e609e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,8 +22,6 @@ #include "dsa_priv.h" -static bool dsa_slave_dev_check(const struct net_device *dev); - /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -116,9 +114,6 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - phylink_stop(dp->pl); dsa_port_disable(dp); @@ -518,7 +513,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) s->tx_bytes += skb->len; u64_stats_update_end(&s->syncp); - DSA_SKB_CB(skb)->deferred_xmit = false; DSA_SKB_CB(skb)->clone = NULL; /* Identify PTP protocol packets, clone them, and pass them to the @@ -531,39 +525,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) */ nskb = p->xmit(skb, dev); if (!nskb) { - if (!DSA_SKB_CB(skb)->deferred_xmit) - kfree_skb(skb); + kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - DSA_SKB_CB(skb)->deferred_xmit = true; - - skb_queue_tail(&dp->xmit_queue, skb); - schedule_work(&dp->xmit_work); - return NULL; -} -EXPORT_SYMBOL_GPL(dsa_defer_xmit); - -static void dsa_port_xmit_work(struct work_struct *work) -{ - struct dsa_port *dp = container_of(work, struct dsa_port, xmit_work); - struct dsa_switch *ds = dp->ds; - struct sk_buff *skb; - - if (unlikely(!ds->ops->port_deferred_xmit)) - return; - - while ((skb = skb_dequeue(&dp->xmit_queue)) != NULL) - ds->ops->port_deferred_xmit(ds, dp->index, skb); -} - /* ethtool operations *******************************************************/ static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -1367,9 +1335,6 @@ int dsa_slave_suspend(struct net_device *slave_dev) if (!netif_running(slave_dev)) return 0; - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - netif_device_detach(slave_dev); rtnl_lock(); @@ -1455,8 +1420,6 @@ int dsa_slave_create(struct dsa_port *port) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - INIT_WORK(&port->xmit_work, dsa_port_xmit_work); - skb_queue_head_init(&port->xmit_queue); p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; @@ -1508,7 +1471,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } -static bool dsa_slave_dev_check(const struct net_device *dev) +bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c new file mode 100644 index 000000000000..466ffa92a474 --- /dev/null +++ b/net/dsa/tag_ar9331.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + */ + + +#include <linux/bitfield.h> +#include <linux/etherdevice.h> + +#include "dsa_priv.h" + +#define AR9331_HDR_LEN 2 +#define AR9331_HDR_VERSION 1 + +#define AR9331_HDR_VERSION_MASK GENMASK(15, 14) +#define AR9331_HDR_PRIORITY_MASK GENMASK(13, 12) +#define AR9331_HDR_TYPE_MASK GENMASK(10, 8) +#define AR9331_HDR_BROADCAST BIT(7) +#define AR9331_HDR_FROM_CPU BIT(6) +/* AR9331_HDR_RESERVED - not used or may be version field. + * According to the AR8216 doc it should 0b10. On AR9331 it is 0b11 on RX path + * and should be set to 0b11 to make it work. + */ +#define AR9331_HDR_RESERVED_MASK GENMASK(5, 4) +#define AR9331_HDR_PORT_NUM_MASK GENMASK(3, 0) + +static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + __le16 *phdr; + u16 hdr; + + if (skb_cow_head(skb, 0) < 0) + return NULL; + + phdr = skb_push(skb, AR9331_HDR_LEN); + + hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION); + hdr |= AR9331_HDR_FROM_CPU | dp->index; + /* 0b10 for AR8216 and 0b11 for AR9331 */ + hdr |= AR9331_HDR_RESERVED_MASK; + + phdr[0] = cpu_to_le16(hdr); + + return skb; +} + +static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt) +{ + u8 ver, port; + u16 hdr; + + if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN))) + return NULL; + + hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb)); + + ver = FIELD_GET(AR9331_HDR_VERSION_MASK, hdr); + if (unlikely(ver != AR9331_HDR_VERSION)) { + netdev_warn_once(ndev, "%s:%i wrong header version 0x%2x\n", + __func__, __LINE__, hdr); + return NULL; + } + + if (unlikely(hdr & AR9331_HDR_FROM_CPU)) { + netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%2x\n", + __func__, __LINE__, hdr); + return NULL; + } + + skb_pull_rcsum(skb, AR9331_HDR_LEN); + + /* Get source port information */ + port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr); + + skb->dev = dsa_master_find_slave(ndev, 0, port); + if (!skb->dev) + return NULL; + + return skb; +} + +static const struct dsa_device_ops ar9331_netdev_ops = { + .name = "ar9331", + .proto = DSA_TAG_PROTO_AR9331, + .xmit = ar9331_tag_xmit, + .rcv = ar9331_tag_rcv, + .overhead = AR9331_HDR_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331); +module_dsa_tag_driver(ar9331_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 63ef2a14c934..5366ea430349 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -83,12 +83,24 @@ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) return false; } +/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ +static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, + struct sk_buff *skb) +{ + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + skb_queue_tail(&sp->xmit_queue, skb_get(skb)); + kthread_queue_work(sp->xmit_worker, &sp->xmit_work); + + return NULL; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - struct dsa_switch *ds = dp->ds; - u16 tx_vid = dsa_8021q_tx_vid(ds, dp->index); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); @@ -97,7 +109,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, * is the .port_deferred_xmit driver callback. */ if (unlikely(sja1105_is_link_local(skb))) - return dsa_defer_xmit(skb, netdev); + return sja1105_defer_xmit(dp->priv, skb); /* If we are under a vlan_filtering bridge, IP termination on * switch ports based on 802.1Q tags is simply too brittle to diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile new file mode 100644 index 000000000000..9a1332fb0cc6 --- /dev/null +++ b/net/ethtool/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-y += ioctl.o common.o + +obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o + +ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ + linkstate.o diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c new file mode 100644 index 000000000000..fce45dac4205 --- /dev/null +++ b/net/ethtool/bitset.c @@ -0,0 +1,735 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool_netlink.h> +#include <linux/bitmap.h> +#include "netlink.h" +#include "bitset.h" + +/* Some bitmaps are internally represented as an array of unsigned long, some + * as an array of u32 (some even as single u32 for now). To avoid the need of + * wrappers on caller side, we provide two set of functions: those with "32" + * suffix in their names expect u32 based bitmaps, those without it expect + * unsigned long bitmaps. + */ + +static u32 ethnl_lower_bits(unsigned int n) +{ + return ~(u32)0 >> (32 - n % 32); +} + +static u32 ethnl_upper_bits(unsigned int n) +{ + return ~(u32)0 << (n % 32); +} + +/** + * ethnl_bitmap32_clear() - Clear u32 based bitmap + * @dst: bitmap to clear + * @start: beginning of the interval + * @end: end of the interval + * @mod: set if bitmap was modified + * + * Clear @nbits bits of a bitmap with indices @start <= i < @end + */ +static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, + bool *mod) +{ + unsigned int start_word = start / 32; + unsigned int end_word = end / 32; + unsigned int i; + u32 mask; + + if (end <= start) + return; + + if (start % 32) { + mask = ethnl_upper_bits(start); + if (end_word == start_word) { + mask &= ethnl_lower_bits(end); + if (dst[start_word] & mask) { + dst[start_word] &= ~mask; + *mod = true; + } + return; + } + if (dst[start_word] & mask) { + dst[start_word] &= ~mask; + *mod = true; + } + start_word++; + } + + for (i = start_word; i < end_word; i++) { + if (dst[i]) { + dst[i] = 0; + *mod = true; + } + } + if (end % 32) { + mask = ethnl_lower_bits(end); + if (dst[end_word] & mask) { + dst[end_word] &= ~mask; + *mod = true; + } + } +} + +/** + * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval + * @map: bitmap to test + * @start: beginning of the interval + * @end: end of the interval + * + * Return: true if there is non-zero bit with index @start <= i < @end, + * false if the whole interval is zero + */ +static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, + unsigned int end) +{ + unsigned int start_word = start / 32; + unsigned int end_word = end / 32; + u32 mask; + + if (end <= start) + return true; + + if (start % 32) { + mask = ethnl_upper_bits(start); + if (end_word == start_word) { + mask &= ethnl_lower_bits(end); + return map[start_word] & mask; + } + if (map[start_word] & mask) + return true; + start_word++; + } + + if (!memchr_inv(map + start_word, '\0', + (end_word - start_word) * sizeof(u32))) + return true; + if (end % 32 == 0) + return true; + return map[end_word] & ethnl_lower_bits(end); +} + +/** + * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask + * pair + * @dst: bitmap to update + * @nbits: bit size of the bitmap + * @value: values to set + * @mask: mask of bits to set + * @mod: set to true if bitmap is modified, preserve if not + * + * Set bits in @dst bitmap which are set in @mask to values from @value, leave + * the rest untouched. If destination bitmap was modified, set @mod to true, + * leave as it is if not. + */ +static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, + const u32 *value, const u32 *mask, bool *mod) +{ + while (nbits > 0) { + u32 real_mask = mask ? *mask : ~(u32)0; + u32 new_value; + + if (nbits < 32) + real_mask &= ethnl_lower_bits(nbits); + new_value = (*dst & ~real_mask) | (*value & real_mask); + if (new_value != *dst) { + *dst = new_value; + *mod = true; + } + + if (nbits <= 32) + break; + dst++; + nbits -= 32; + value++; + if (mask) + mask++; + } +} + +static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) +{ + return map[index / 32] & (1U << (index % 32)); +} + +/** + * ethnl_bitset32_size() - Calculate size of bitset nested attribute + * @val: value bitmap (u32 based) + * @mask: mask bitmap (u32 based, optional) + * @nbits: bit length of the bitset + * @names: array of bit names (optional) + * @compact: assume compact format for output + * + * Estimate length of netlink attribute composed by a later call to + * ethnl_put_bitset32() call with the same arguments. + * + * Return: negative error code or attribute length estimate + */ +int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact) +{ + unsigned int len = 0; + + /* list flag */ + if (!mask) + len += nla_total_size(sizeof(u32)); + /* size */ + len += nla_total_size(sizeof(u32)); + + if (compact) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + /* value, mask */ + len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); + } else { + unsigned int bits_len = 0; + unsigned int bit_len, i; + + for (i = 0; i < nbits; i++) { + const char *name = names ? names[i] : NULL; + + if (!ethnl_bitmap32_test_bit(mask ?: val, i)) + continue; + /* index */ + bit_len = nla_total_size(sizeof(u32)); + /* name */ + if (name) + bit_len += ethnl_strz_size(name); + /* value */ + if (mask && ethnl_bitmap32_test_bit(val, i)) + bit_len += nla_total_size(0); + + /* bit nest */ + bits_len += nla_total_size(bit_len); + } + /* bits nest */ + len += nla_total_size(bits_len); + } + + /* outermost nest */ + return nla_total_size(len); +} + +/** + * ethnl_put_bitset32() - Put a bitset nest into a message + * @skb: skb with the message + * @attrtype: attribute type for the bitset nest + * @val: value bitmap (u32 based) + * @mask: mask bitmap (u32 based, optional) + * @nbits: bit length of the bitset + * @names: array of bit names (optional) + * @compact: use compact format for the output + * + * Compose a nested attribute representing a bitset. If @mask is null, simple + * bitmap (bit list) is created, if @mask is provided, represent a value/mask + * pair. Bit names are only used in verbose mode and when provided by calller. + * + * Return: 0 on success, negative error value on error + */ +int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, + const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact) +{ + struct nlattr *nest; + struct nlattr *attr; + + nest = nla_nest_start(skb, attrtype); + if (!nest) + return -EMSGSIZE; + + if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) + goto nla_put_failure; + if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) + goto nla_put_failure; + if (compact) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + unsigned int nbytes = nwords * sizeof(u32); + u32 *dst; + + attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); + if (!attr) + goto nla_put_failure; + dst = nla_data(attr); + memcpy(dst, val, nbytes); + if (nbits % 32) + dst[nwords - 1] &= ethnl_lower_bits(nbits); + + if (mask) { + attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); + if (!attr) + goto nla_put_failure; + dst = nla_data(attr); + memcpy(dst, mask, nbytes); + if (nbits % 32) + dst[nwords - 1] &= ethnl_lower_bits(nbits); + } + } else { + struct nlattr *bits; + unsigned int i; + + bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); + if (!bits) + goto nla_put_failure; + for (i = 0; i < nbits; i++) { + const char *name = names ? names[i] : NULL; + + if (!ethnl_bitmap32_test_bit(mask ?: val, i)) + continue; + attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); + if (!attr) + goto nla_put_failure; + if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) + goto nla_put_failure; + if (name && + ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) + goto nla_put_failure; + if (mask && ethnl_bitmap32_test_bit(val, i) && + nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) + goto nla_put_failure; + nla_nest_end(skb, attr); + } + nla_nest_end(skb, bits); + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = { + [ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, + [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, + [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, + [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, +}; + +static const struct nla_policy bit_policy[ETHTOOL_A_BITSET_BIT_MAX + 1] = { + [ETHTOOL_A_BITSET_BIT_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, + [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, +}; + +/** + * ethnl_bitset_is_compact() - check if bitset attribute represents a compact + * bitset + * @bitset: nested attribute representing a bitset + * @compact: pointer for return value + * + * Return: 0 on success, negative error code on failure + */ +int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, bitset, + bitset_policy, NULL); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BITS]) { + if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) + return -EINVAL; + *compact = false; + return 0; + } + if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) + return -EINVAL; + + *compact = true; + return 0; +} + +/** + * ethnl_name_to_idx() - look up string index for a name + * @names: array of ETH_GSTRING_LEN sized strings + * @n_names: number of strings in the array + * @name: name to look up + * + * Return: index of the string if found, -ENOENT if not found + */ +static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, + const char *name) +{ + unsigned int i; + + if (!names) + return -ENOENT; + + for (i = 0; i < n_names; i++) { + /* names[i] may not be null terminated */ + if (!strncmp(names[i], name, ETH_GSTRING_LEN) && + strlen(name) <= ETH_GSTRING_LEN) + return i; + } + + return -ENOENT; +} + +static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, + const struct nlattr *bit_attr, bool no_mask, + ethnl_string_array_t names, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_BIT_MAX + 1]; + int ret, idx; + + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_BIT_MAX, bit_attr, + bit_policy, extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { + const char *name; + + idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); + if (idx >= nbits) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_BITSET_BIT_INDEX], + "bit index too high"); + return -EOPNOTSUPP; + } + name = names ? names[idx] : NULL; + if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && + strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, + nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "bit index and name mismatch"); + return -EINVAL; + } + } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { + idx = ethnl_name_to_idx(names, nbits, + nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); + if (idx < 0) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_BITSET_BIT_NAME], + "bit name not found"); + return -EOPNOTSUPP; + } + } else { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "neither bit index nor name specified"); + return -EINVAL; + } + + *index = idx; + *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; + return 0; +} + +static int +ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, struct nlattr **tb, + ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + struct nlattr *bit_attr; + bool no_mask; + int rem; + int ret; + + if (tb[ETHTOOL_A_BITSET_VALUE]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], + "value only allowed in compact bitset"); + return -EINVAL; + } + if (tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "mask only allowed in compact bitset"); + return -EINVAL; + } + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + + nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { + bool old_val, new_val; + unsigned int idx; + + if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); + return -EINVAL; + } + ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, + names, extack); + if (ret < 0) + return ret; + old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); + if (new_val != old_val) { + if (new_val) + bitmap[idx / 32] |= ((u32)1 << (idx % 32)); + else + bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); + *mod = true; + } + } + + return 0; +} + +static int ethnl_compact_sanity_checks(unsigned int nbits, + const struct nlattr *nest, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + unsigned int attr_nbits, attr_nwords; + const struct nlattr *test_attr; + + if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "mask not allowed in list bitset"); + return -EINVAL; + } + if (!tb[ETHTOOL_A_BITSET_SIZE]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing size in compact bitset"); + return -EINVAL; + } + if (!tb[ETHTOOL_A_BITSET_VALUE]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing value in compact bitset"); + return -EINVAL; + } + if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing mask in compact nonlist bitset"); + return -EINVAL; + } + + attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); + attr_nwords = DIV_ROUND_UP(attr_nbits, 32); + if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], + "bitset value length does not match size"); + return -EINVAL; + } + if (tb[ETHTOOL_A_BITSET_MASK] && + nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "bitset mask length does not match size"); + return -EINVAL; + } + if (attr_nbits <= nbits) + return 0; + + test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : + tb[ETHTOOL_A_BITSET_MASK]; + if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { + NL_SET_ERR_MSG_ATTR(extack, test_attr, + "cannot modify bits past kernel bitset size"); + return -EINVAL; + } + return 0; +} + +/** + * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap + * @bitmap: bitmap to update + * @nbits: size of the updated bitmap in bits + * @attr: nest attribute to parse and apply + * @names: array of bit names; may be null for compact format + * @extack: extack for error reporting + * @mod: set this to true if bitmap is modified, leave as it is if not + * + * Apply bitset netsted attribute to a bitmap. If the attribute represents + * a bit list, @bitmap is set to its contents; otherwise, bits in mask are + * set to values from value. Bitmaps in the attribute may be longer than + * @nbits but the message must not request modifying any bits past @nbits. + * + * Return: negative error code on failure, 0 on success + */ +int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1]; + unsigned int change_bits; + bool no_mask; + int ret; + + if (!attr) + return 0; + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, attr, bitset_policy, + extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BITS]) + return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, + names, extack, mod); + ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); + if (ret < 0) + return ret; + + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + change_bits = min_t(unsigned int, + nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); + ethnl_bitmap32_update(bitmap, change_bits, + nla_data(tb[ETHTOOL_A_BITSET_VALUE]), + no_mask ? NULL : + nla_data(tb[ETHTOOL_A_BITSET_MASK]), + mod); + if (no_mask && change_bits < nbits) + ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); + + return 0; +} + +#if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) + +/* 64-bit big endian architectures are the only case when u32 based bitmaps + * and unsigned long based bitmaps have different memory layout so that we + * cannot simply cast the latter to the former and need actual wrappers + * converting the latter to the former. + * + * To reduce the number of slab allocations, the wrappers use fixed size local + * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the + * majority of bitmaps used by ethtool. + */ +#define ETHNL_SMALL_BITMAP_BITS 128 +#define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) + +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; + u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *mask32; + u32 *val32; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); + if (!val32) + return -ENOMEM; + mask32 = val32 + nwords; + } else { + val32 = small_val32; + mask32 = small_mask32; + } + + bitmap_to_arr32(val32, val, nbits); + if (mask) + bitmap_to_arr32(mask32, mask, nbits); + else + mask32 = NULL; + ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(val32); + + return ret; +} + +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; + u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *mask32; + u32 *val32; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); + if (!val32) + return -ENOMEM; + mask32 = val32 + nwords; + } else { + val32 = small_val32; + mask32 = small_mask32; + } + + bitmap_to_arr32(val32, val, nbits); + if (mask) + bitmap_to_arr32(mask32, mask, nbits); + else + mask32 = NULL; + ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, + compact); + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(val32); + + return ret; +} + +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *bitmap32 = small_bitmap32; + bool u32_mod = false; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int dst_words = DIV_ROUND_UP(nbits, 32); + + bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); + if (!bitmap32) + return -ENOMEM; + } + + bitmap_to_arr32(bitmap32, bitmap, nbits); + ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, + &u32_mod); + if (u32_mod) { + bitmap_from_arr32(bitmap, bitmap32, nbits); + *mod = true; + } + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(bitmap32); + + return ret; +} + +#else + +/* On little endian 64-bit and all 32-bit architectures, an unsigned long + * based bitmap can be interpreted as u32 based one using a simple cast. + */ + +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, + names, compact); +} + +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, + (const u32 *)mask, nbits, names, compact); +} + +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, + mod); +} + +#endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */ diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h new file mode 100644 index 000000000000..b8247e34109d --- /dev/null +++ b/net/ethtool/bitset.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_BITSET_H +#define _NET_ETHTOOL_BITSET_H + +typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN]; + +int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact); +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact); +int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact); +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact); +int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, + const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact); +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod); +int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod); + +#endif /* _NET_ETHTOOL_BITSET_H */ diff --git a/net/ethtool/common.c b/net/ethtool/common.c new file mode 100644 index 000000000000..e621b1694d2f --- /dev/null +++ b/net/ethtool/common.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "common.h" + +const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { + [NETIF_F_SG_BIT] = "tx-scatter-gather", + [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", + [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", + [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", + [NETIF_F_HIGHDMA_BIT] = "highdma", + [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", + [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", + + [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", + [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", + [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", + [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", + [NETIF_F_GSO_BIT] = "tx-generic-segmentation", + [NETIF_F_LLTX_BIT] = "tx-lockless", + [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", + [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", + [NETIF_F_LRO_BIT] = "rx-lro", + + [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", + [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", + [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", + [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", + [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", + [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", + [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", + [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", + [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", + [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", + + [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", + [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", + [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", + [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", + [NETIF_F_RXHASH_BIT] = "rx-hashing", + [NETIF_F_RXCSUM_BIT] = "rx-checksum", + [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", + [NETIF_F_LOOPBACK_BIT] = "loopback", + [NETIF_F_RXFCS_BIT] = "rx-fcs", + [NETIF_F_RXALL_BIT] = "rx-all", + [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", + [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", + [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", + [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", + [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", +}; + +const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", + [ETH_RSS_HASH_CRC32_BIT] = "crc32", +}; + +const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", + [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", +}; + +const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", + [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", + [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", +}; + +#define __LINK_MODE_NAME(speed, type, duplex) \ + #speed "base" #type "/" #duplex +#define __DEFINE_LINK_MODE_NAME(speed, type, duplex) \ + [ETHTOOL_LINK_MODE(speed, type, duplex)] = \ + __LINK_MODE_NAME(speed, type, duplex) +#define __DEFINE_SPECIAL_MODE_NAME(_mode, _name) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = _name + +const char link_mode_names[][ETH_GSTRING_LEN] = { + __DEFINE_LINK_MODE_NAME(10, T, Half), + __DEFINE_LINK_MODE_NAME(10, T, Full), + __DEFINE_LINK_MODE_NAME(100, T, Half), + __DEFINE_LINK_MODE_NAME(100, T, Full), + __DEFINE_LINK_MODE_NAME(1000, T, Half), + __DEFINE_LINK_MODE_NAME(1000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(Autoneg, "Autoneg"), + __DEFINE_SPECIAL_MODE_NAME(TP, "TP"), + __DEFINE_SPECIAL_MODE_NAME(AUI, "AUI"), + __DEFINE_SPECIAL_MODE_NAME(MII, "MII"), + __DEFINE_SPECIAL_MODE_NAME(FIBRE, "FIBRE"), + __DEFINE_SPECIAL_MODE_NAME(BNC, "BNC"), + __DEFINE_LINK_MODE_NAME(10000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(Pause, "Pause"), + __DEFINE_SPECIAL_MODE_NAME(Asym_Pause, "Asym_Pause"), + __DEFINE_LINK_MODE_NAME(2500, X, Full), + __DEFINE_SPECIAL_MODE_NAME(Backplane, "Backplane"), + __DEFINE_LINK_MODE_NAME(1000, KX, Full), + __DEFINE_LINK_MODE_NAME(10000, KX4, Full), + __DEFINE_LINK_MODE_NAME(10000, KR, Full), + __DEFINE_SPECIAL_MODE_NAME(10000baseR_FEC, "10000baseR_FEC"), + __DEFINE_LINK_MODE_NAME(20000, MLD2, Full), + __DEFINE_LINK_MODE_NAME(20000, KR2, Full), + __DEFINE_LINK_MODE_NAME(40000, KR4, Full), + __DEFINE_LINK_MODE_NAME(40000, CR4, Full), + __DEFINE_LINK_MODE_NAME(40000, SR4, Full), + __DEFINE_LINK_MODE_NAME(40000, LR4, Full), + __DEFINE_LINK_MODE_NAME(56000, KR4, Full), + __DEFINE_LINK_MODE_NAME(56000, CR4, Full), + __DEFINE_LINK_MODE_NAME(56000, SR4, Full), + __DEFINE_LINK_MODE_NAME(56000, LR4, Full), + __DEFINE_LINK_MODE_NAME(25000, CR, Full), + __DEFINE_LINK_MODE_NAME(25000, KR, Full), + __DEFINE_LINK_MODE_NAME(25000, SR, Full), + __DEFINE_LINK_MODE_NAME(50000, CR2, Full), + __DEFINE_LINK_MODE_NAME(50000, KR2, Full), + __DEFINE_LINK_MODE_NAME(100000, KR4, Full), + __DEFINE_LINK_MODE_NAME(100000, SR4, Full), + __DEFINE_LINK_MODE_NAME(100000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_NAME(50000, SR2, Full), + __DEFINE_LINK_MODE_NAME(1000, X, Full), + __DEFINE_LINK_MODE_NAME(10000, CR, Full), + __DEFINE_LINK_MODE_NAME(10000, SR, Full), + __DEFINE_LINK_MODE_NAME(10000, LR, Full), + __DEFINE_LINK_MODE_NAME(10000, LRM, Full), + __DEFINE_LINK_MODE_NAME(10000, ER, Full), + __DEFINE_LINK_MODE_NAME(2500, T, Full), + __DEFINE_LINK_MODE_NAME(5000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(FEC_NONE, "None"), + __DEFINE_SPECIAL_MODE_NAME(FEC_RS, "RS"), + __DEFINE_SPECIAL_MODE_NAME(FEC_BASER, "BASER"), + __DEFINE_LINK_MODE_NAME(50000, KR, Full), + __DEFINE_LINK_MODE_NAME(50000, SR, Full), + __DEFINE_LINK_MODE_NAME(50000, CR, Full), + __DEFINE_LINK_MODE_NAME(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(50000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, KR2, Full), + __DEFINE_LINK_MODE_NAME(100000, SR2, Full), + __DEFINE_LINK_MODE_NAME(100000, CR2, Full), + __DEFINE_LINK_MODE_NAME(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(100000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, KR4, Full), + __DEFINE_LINK_MODE_NAME(200000, SR4, Full), + __DEFINE_LINK_MODE_NAME(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(200000, DR4, Full), + __DEFINE_LINK_MODE_NAME(200000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100, T1, Full), + __DEFINE_LINK_MODE_NAME(1000, T1, Full), + __DEFINE_LINK_MODE_NAME(400000, KR8, Full), + __DEFINE_LINK_MODE_NAME(400000, SR8, Full), + __DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_NAME(400000, DR8, Full), + __DEFINE_LINK_MODE_NAME(400000, CR8, Full), +}; +static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); + +/* return false if legacy contained non-0 deprecated fields + * maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} + +int __ethtool_get_link(struct net_device *dev) +{ + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + return netif_running(dev) && dev->ethtool_ops->get_link(dev); +} diff --git a/net/ethtool/common.h b/net/ethtool/common.h new file mode 100644 index 000000000000..5c5f7dc90cd4 --- /dev/null +++ b/net/ethtool/common.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ETHTOOL_COMMON_H +#define _ETHTOOL_COMMON_H + +#include <linux/netdevice.h> +#include <linux/ethtool.h> + +/* compose link mode index from speed, type and duplex */ +#define ETHTOOL_LINK_MODE(speed, type, duplex) \ + ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT + +extern const char +netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; +extern const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN]; +extern const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; +extern const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; +extern const char link_mode_names[][ETH_GSTRING_LEN]; + +int __ethtool_get_link(struct net_device *dev); + +bool convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings); + +#endif /* _ETHTOOL_COMMON_H */ diff --git a/net/core/ethtool.c b/net/ethtool/ioctl.c index cd9bc67381b2..182bffbffa78 100644 --- a/net/core/ethtool.c +++ b/net/ethtool/ioctl.c @@ -26,6 +26,9 @@ #include <net/devlink.h> #include <net/xdp_sock.h> #include <net/flow_offload.h> +#include <linux/ethtool_netlink.h> + +#include "common.h" /* * Some useful ethtool_ops methods that're device independent. @@ -54,88 +57,6 @@ EXPORT_SYMBOL(ethtool_op_get_ts_info); #define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) -static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { - [NETIF_F_SG_BIT] = "tx-scatter-gather", - [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", - [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", - [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", - [NETIF_F_HIGHDMA_BIT] = "highdma", - [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", - [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", - - [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", - [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", - [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", - [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", - [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", - [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", - [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_LLTX_BIT] = "tx-lockless", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", - [NETIF_F_GRO_BIT] = "rx-gro", - [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", - [NETIF_F_LRO_BIT] = "rx-lro", - - [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", - [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", - [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", - [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", - [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", - [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", - [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", - [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", - [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", - [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", - [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", - [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", - [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", - [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", - - [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", - [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", - [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", - [NETIF_F_RXHASH_BIT] = "rx-hashing", - [NETIF_F_RXCSUM_BIT] = "rx-checksum", - [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", - [NETIF_F_LOOPBACK_BIT] = "loopback", - [NETIF_F_RXFCS_BIT] = "rx-fcs", - [NETIF_F_RXALL_BIT] = "rx-all", - [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", - [NETIF_F_HW_TC_BIT] = "hw-tc-offload", - [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", - [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", - [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", - [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", - [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", - [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", -}; - -static const char -rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { - [ETH_RSS_HASH_TOP_BIT] = "toeplitz", - [ETH_RSS_HASH_XOR_BIT] = "xor", - [ETH_RSS_HASH_CRC32_BIT] = "crc32", -}; - -static const char -tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", - [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", - [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", -}; - -static const char -phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", - [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", - [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", -}; - static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -234,6 +155,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) !ops->get_ethtool_phy_stats) return phy_ethtool_get_sset_count(dev->phydev); + if (sset == ETH_SS_LINK_MODES) + return __ETHTOOL_LINK_MODE_MASK_NBITS; + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -258,6 +182,9 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats) phy_ethtool_get_strings(dev->phydev, data); + else if (stringset == ETH_SS_LINK_MODES) + memcpy(data, link_mode_names, + __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -432,54 +359,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, } EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); -/* return false if legacy contained non-0 deprecated fields - * maxtxpkt/maxrxpkt. rest of ksettings always updated - */ -static bool -convert_legacy_settings_to_link_ksettings( - struct ethtool_link_ksettings *link_ksettings, - const struct ethtool_cmd *legacy_settings) -{ - bool retval = true; - - memset(link_ksettings, 0, sizeof(*link_ksettings)); - - /* This is used to tell users that driver is still using these - * deprecated legacy fields, and they should not use - * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS - */ - if (legacy_settings->maxtxpkt || - legacy_settings->maxrxpkt) - retval = false; - - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.supported, - legacy_settings->supported); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.advertising, - legacy_settings->advertising); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.lp_advertising, - legacy_settings->lp_advertising); - link_ksettings->base.speed - = ethtool_cmd_speed(legacy_settings); - link_ksettings->base.duplex - = legacy_settings->duplex; - link_ksettings->base.port - = legacy_settings->port; - link_ksettings->base.phy_address - = legacy_settings->phy_address; - link_ksettings->base.autoneg - = legacy_settings->autoneg; - link_ksettings->base.mdio_support - = legacy_settings->mdio_support; - link_ksettings->base.eth_tp_mdix - = legacy_settings->eth_tp_mdix; - link_ksettings->base.eth_tp_mdix_ctrl - = legacy_settings->eth_tp_mdix_ctrl; - return retval; -} - /* return false if ksettings link modes had higher bits * set. legacy_settings always updated (best effort) */ @@ -693,7 +572,12 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + if (err >= 0) { + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } + return err; } /* Query device for its ethtool_cmd settings. @@ -742,6 +626,7 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; + int ret; ASSERT_RTNL(); @@ -754,7 +639,12 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return -EINVAL; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + if (ret >= 0) { + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } + return ret; } static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, @@ -1475,12 +1365,12 @@ static int ethtool_nway_reset(struct net_device *dev) static int ethtool_get_link(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + int link = __ethtool_get_link(dev); - if (!dev->ethtool_ops->get_link) - return -EOPNOTSUPP; - - edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + if (link < 0) + return link; + edata.data = link; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; @@ -2170,8 +2060,8 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GET_TS_INFO; - if (phydev && phydev->drv && phydev->drv->ts_info) { - err = phydev->drv->ts_info(phydev, &info); + if (phy_has_tsinfo(phydev)) { + err = phy_ts_info(phydev, &info); } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, &info); } else { diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c new file mode 100644 index 000000000000..5d16cb4e8693 --- /dev/null +++ b/net/ethtool/linkinfo.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct linkinfo_req_info { + struct ethnl_req_info base; +}; + +struct linkinfo_reply_data { + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; +}; + +#define LINKINFO_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkinfo_reply_data, base) + +static const struct nla_policy +linkinfo_get_policy[ETHTOOL_A_LINKINFO_MAX + 1] = { + [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT }, +}; + +static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + data->lsettings = &data->ksettings.base; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = __ethtool_get_link_ksettings(dev, &data->ksettings); + if (ret < 0 && info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + ethnl_ops_complete(dev); + + return ret; +} + +static int linkinfo_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */ + + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */ + + 0; +} + +static int linkinfo_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); + + if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, + data->lsettings->phy_address) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, + data->lsettings->eth_tp_mdix) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, + data->lsettings->eth_tp_mdix_ctrl) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, + data->lsettings->transceiver)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkinfo_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKINFO_GET, + .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, + .max_attr = ETHTOOL_A_LINKINFO_MAX, + .req_info_size = sizeof(struct linkinfo_req_info), + .reply_data_size = sizeof(struct linkinfo_reply_data), + .request_policy = linkinfo_get_policy, + + .prepare_data = linkinfo_prepare_data, + .reply_size = linkinfo_reply_size, + .fill_reply = linkinfo_fill_reply, +}; + +/* LINKINFO_SET */ + +static const struct nla_policy +linkinfo_set_policy[ETHTOOL_A_LINKINFO_MAX + 1] = { + [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT }, +}; + +int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_LINKINFO_MAX + 1]; + struct ethtool_link_ksettings ksettings = {}; + struct ethtool_link_settings *lsettings; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_LINKINFO_MAX, linkinfo_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKINFO_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_link_ksettings || + !dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = __ethtool_get_link_ksettings(dev, &ksettings); + if (ret < 0) { + if (info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out_ops; + } + lsettings = &ksettings.base; + + ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod); + ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR], + &mod); + ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, + tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); + ret = 0; + if (!mod) + goto out_ops; + + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) + GENL_SET_ERR_MSG(info, "link settings update failed"); + else + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c new file mode 100644 index 000000000000..96f20be64553 --- /dev/null +++ b/net/ethtool/linkmodes.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct linkmodes_req_info { + struct ethnl_req_info base; +}; + +struct linkmodes_reply_data { + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; + bool peer_empty; +}; + +#define LINKMODES_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkmodes_reply_data, base) + +static const struct nla_policy +linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { + [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, +}; + +static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + data->lsettings = &data->ksettings.base; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = __ethtool_get_link_ksettings(dev, &data->ksettings); + if (ret < 0 && info) { + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out; + } + + data->peer_empty = + bitmap_empty(data->ksettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + +out: + ethnl_ops_complete(dev); + return ret; +} + +static int linkmodes_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + const struct ethtool_link_ksettings *ksettings = &data->ksettings; + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int len, ret; + + len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + + 0; + ret = ethnl_bitset_size(ksettings->link_modes.advertising, + ksettings->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + if (!data->peer_empty) { + ret = ethnl_bitset_size(ksettings->link_modes.lp_advertising, + NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + } + + return len; +} + +static int linkmodes_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int ret; + + if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg)) + return -EMSGSIZE; + + ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS, + ksettings->link_modes.advertising, + ksettings->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, + compact); + if (ret < 0) + return -EMSGSIZE; + if (!data->peer_empty) { + ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER, + ksettings->link_modes.lp_advertising, + NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return -EMSGSIZE; + } + + if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, lsettings->speed) || + nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkmodes_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKMODES_GET, + .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, + .max_attr = ETHTOOL_A_LINKMODES_MAX, + .req_info_size = sizeof(struct linkmodes_req_info), + .reply_data_size = sizeof(struct linkmodes_reply_data), + .request_policy = linkmodes_get_policy, + + .prepare_data = linkmodes_prepare_data, + .reply_size = linkmodes_reply_size, + .fill_reply = linkmodes_fill_reply, +}; + +/* LINKMODES_SET */ + +struct link_mode_info { + int speed; + u8 duplex; +}; + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .duplex = __DUPLEX_ ## _duplex \ + } +#define __DUPLEX_Half DUPLEX_HALF +#define __DUPLEX_Full DUPLEX_FULL +#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ + .speed = SPEED_UNKNOWN, \ + .duplex = DUPLEX_UNKNOWN, \ + } + +static const struct link_mode_info link_mode_params[] = { + __DEFINE_LINK_MODE_PARAMS(10, T, Half), + __DEFINE_LINK_MODE_PARAMS(10, T, Full), + __DEFINE_LINK_MODE_PARAMS(100, T, Half), + __DEFINE_LINK_MODE_PARAMS(100, T, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T, Half), + __DEFINE_LINK_MODE_PARAMS(1000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), + __DEFINE_SPECIAL_MODE_PARAMS(TP), + __DEFINE_SPECIAL_MODE_PARAMS(AUI), + __DEFINE_SPECIAL_MODE_PARAMS(MII), + __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), + __DEFINE_SPECIAL_MODE_PARAMS(BNC), + __DEFINE_LINK_MODE_PARAMS(10000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Pause), + __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), + __DEFINE_LINK_MODE_PARAMS(2500, X, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Backplane), + __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), + [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + }, + __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), + __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(1000, X, Full), + __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), + __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), + __DEFINE_LINK_MODE_PARAMS(2500, T, Full), + __DEFINE_LINK_MODE_PARAMS(5000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), + __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, T1, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), +}; + +static const struct nla_policy +linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { + [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, + [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, +}; + +/* Set advertised link modes to all supported modes matching requested speed + * and duplex values. Called when autonegotiation is on, speed or duplex is + * requested but no link mode change. This is done in userspace with ioctl() + * interface, move it into kernel for netlink. + * Returns true if advertised modes bitmap was modified. + */ +static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, + bool req_speed, bool req_duplex) +{ + unsigned long *advertising = ksettings->link_modes.advertising; + unsigned long *supported = ksettings->link_modes.supported; + DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(link_mode_params) != + __ETHTOOL_LINK_MODE_MASK_NBITS); + + bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); + + for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) { + const struct link_mode_info *info = &link_mode_params[i]; + + if (info->speed == SPEED_UNKNOWN) + continue; + if (test_bit(i, supported) && + (!req_speed || info->speed == ksettings->base.speed) && + (!req_duplex || info->duplex == ksettings->base.duplex)) + set_bit(i, advertising); + else + clear_bit(i, advertising); + } + + return !bitmap_equal(old_adv, advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, + struct ethtool_link_ksettings *ksettings, + bool *mod) +{ + struct ethtool_link_settings *lsettings = &ksettings->base; + bool req_speed, req_duplex; + int ret; + + *mod = false; + req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; + req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; + + ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG], + mod); + ret = ethnl_update_bitset(ksettings->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names, + info->extack, mod); + if (ret < 0) + return ret; + ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED], + mod); + ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], + mod); + + if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && + (req_speed || req_duplex) && + ethnl_auto_linkmodes(ksettings, req_speed, req_duplex)) + *mod = true; + + return 0; +} + +int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_LINKMODES_MAX + 1]; + struct ethtool_link_ksettings ksettings = {}; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_LINKMODES_MAX, linkmodes_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKMODES_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_link_ksettings || + !dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = __ethtool_get_link_ksettings(dev, &ksettings); + if (ret < 0) { + if (info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out_ops; + } + + ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod); + if (ret < 0) + goto out_ops; + + if (mod) { + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) + GENL_SET_ERR_MSG(info, "link settings update failed"); + else + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c new file mode 100644 index 000000000000..2740cde0a182 --- /dev/null +++ b/net/ethtool/linkstate.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct linkstate_req_info { + struct ethnl_req_info base; +}; + +struct linkstate_reply_data { + struct ethnl_reply_data base; + int link; +}; + +#define LINKSTATE_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkstate_reply_data, base) + +static const struct nla_policy +linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { + [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, +}; + +static int linkstate_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->link = __ethtool_get_link(dev); + ethnl_ops_complete(dev); + + return 0; +} + +static int linkstate_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + + 0; +} + +static int linkstate_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + + if (data->link >= 0 && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkstate_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKSTATE_GET, + .reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKSTATE_HEADER, + .max_attr = ETHTOOL_A_LINKSTATE_MAX, + .req_info_size = sizeof(struct linkstate_req_info), + .reply_data_size = sizeof(struct linkstate_reply_data), + .request_policy = linkstate_get_policy, + + .prepare_data = linkstate_prepare_data, + .reply_size = linkstate_reply_size, + .fill_reply = linkstate_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c new file mode 100644 index 000000000000..86b79f9bc08d --- /dev/null +++ b/net/ethtool/netlink.c @@ -0,0 +1,696 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <net/sock.h> +#include <linux/ethtool_netlink.h> +#include "netlink.h" + +static struct genl_family ethtool_genl_family; + +static bool ethnl_ok __read_mostly; +static u32 ethnl_bcast_seq; + +static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = { + [ETHTOOL_A_HEADER_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = ALTIFNAMSIZ - 1 }, + [ETHTOOL_A_HEADER_FLAGS] = { .type = NLA_U32 }, +}; + +/** + * ethnl_parse_header() - parse request header + * @req_info: structure to put results into + * @header: nest attribute with request header + * @net: request netns + * @extack: netlink extack for error reporting + * @require_dev: fail if no device identified in header + * + * Parse request header in nested attribute @nest and puts results into + * the structure pointed to by @req_info. Extack from @info is used for error + * reporting. If req_info->dev is not null on return, reference to it has + * been taken. If error is returned, *req_info is null initialized and no + * reference is held. + * + * Return: 0 on success or negative error code + */ +int ethnl_parse_header(struct ethnl_req_info *req_info, + const struct nlattr *header, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1]; + const struct nlattr *devname_attr; + struct net_device *dev = NULL; + int ret; + + if (!header) { + NL_SET_ERR_MSG(extack, "request header missing"); + return -EINVAL; + } + ret = nla_parse_nested(tb, ETHTOOL_A_HEADER_MAX, header, + ethnl_header_policy, extack); + if (ret < 0) + return ret; + devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; + + if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { + u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); + + dev = dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_HEADER_DEV_INDEX], + "no device matches ifindex"); + return -ENODEV; + } + /* if both ifindex and ifname are passed, they must match */ + if (devname_attr && + strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { + dev_put(dev); + NL_SET_ERR_MSG_ATTR(extack, header, + "ifindex and name do not match"); + return -ENODEV; + } + } else if (devname_attr) { + dev = dev_get_by_name(net, nla_data(devname_attr)); + if (!dev) { + NL_SET_ERR_MSG_ATTR(extack, devname_attr, + "no device matches name"); + return -ENODEV; + } + } else if (require_dev) { + NL_SET_ERR_MSG_ATTR(extack, header, + "neither ifindex nor name specified"); + return -EINVAL; + } + + if (dev && !netif_device_present(dev)) { + dev_put(dev); + NL_SET_ERR_MSG(extack, "device not present"); + return -ENODEV; + } + + req_info->dev = dev; + if (tb[ETHTOOL_A_HEADER_FLAGS]) + req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); + + return 0; +} + +/** + * ethnl_fill_reply_header() - Put common header into a reply message + * @skb: skb with the message + * @dev: network device to describe in header + * @attrtype: attribute type to use for the nest + * + * Create a nested attribute with attributes describing given network device. + * + * Return: 0 on success, error value (-EMSGSIZE only) on error + */ +int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, + u16 attrtype) +{ + struct nlattr *nest; + + if (!dev) + return 0; + nest = nla_nest_start(skb, attrtype); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) || + nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) + goto nla_put_failure; + /* If more attributes are put into reply header, ethnl_header_size() + * must be updated to account for them. + */ + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +/** + * ethnl_reply_init() - Create skb for a reply and fill device identification + * @payload: payload length (without netlink and genetlink header) + * @dev: device the reply is about (may be null) + * @cmd: ETHTOOL_MSG_* message type for reply + * @info: genetlink info of the received packet we respond to + * @ehdrp: place to store payload pointer returned by genlmsg_new() + * + * Return: pointer to allocated skb on success, NULL on error + */ +struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, + u16 hdr_attrtype, struct genl_info *info, + void **ehdrp) +{ + struct sk_buff *skb; + + skb = genlmsg_new(payload, GFP_KERNEL); + if (!skb) + goto err; + *ehdrp = genlmsg_put_reply(skb, info, ðtool_genl_family, 0, cmd); + if (!*ehdrp) + goto err_free; + + if (dev) { + int ret; + + ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype); + if (ret < 0) + goto err_free; + } + return skb; + +err_free: + nlmsg_free(skb); +err: + if (info) + GENL_SET_ERR_MSG(info, "failed to setup reply message"); + return NULL; +} + +static void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) +{ + return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, + cmd); +} + +static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) +{ + return genlmsg_multicast_netns(ðtool_genl_family, dev_net(dev), skb, + 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); +} + +/* GET request helpers */ + +/** + * struct ethnl_dump_ctx - context structure for generic dumpit() callback + * @ops: request ops of currently processed message type + * @req_info: parsed request header of processed request + * @pos_hash: saved iteration position - hashbucket + * @pos_idx: saved iteration position - index + * + * These parameters are kept in struct netlink_callback as context preserved + * between iterations. They are initialized by ethnl_default_start() and used + * in ethnl_default_dumpit() and ethnl_default_done(). + */ +struct ethnl_dump_ctx { + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct ethnl_reply_data *reply_data; + int pos_hash; + int pos_idx; +}; + +static const struct ethnl_request_ops * +ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { + [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, + [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, + [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, +}; + +static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) +{ + return (struct ethnl_dump_ctx *)cb->ctx; +} + +/** + * ethnl_default_parse() - Parse request message + * @req_info: pointer to structure to put data into + * @nlhdr: pointer to request message header + * @net: request netns + * @request_ops: struct request_ops for request type + * @extack: netlink extack for error reporting + * @require_dev: fail if no device identified in header + * + * Parse universal request header and call request specific ->parse_request() + * callback (if defined) to parse the rest of the message. + * + * Return: 0 on success or negative error code + */ +static int ethnl_default_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + const struct ethnl_request_ops *request_ops, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr **tb; + int ret; + + tb = kmalloc_array(request_ops->max_attr + 1, sizeof(tb[0]), + GFP_KERNEL); + if (!tb) + return -ENOMEM; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, request_ops->max_attr, + request_ops->request_policy, extack); + if (ret < 0) + goto out; + ret = ethnl_parse_header(req_info, tb[request_ops->hdr_attr], net, + extack, require_dev); + if (ret < 0) + goto out; + + if (request_ops->parse_request) { + ret = request_ops->parse_request(req_info, tb, extack); + if (ret < 0) + goto out; + } + + ret = 0; +out: + kfree(tb); + return ret; +} + +/** + * ethnl_init_reply_data() - Initialize reply data for GET request + * @req_info: pointer to embedded struct ethnl_req_info + * @ops: instance of struct ethnl_request_ops describing the layout + * @dev: network device to initialize the reply for + * + * Fills the reply data part with zeros and sets the dev member. Must be called + * before calling the ->fill_reply() callback (for each iteration when handling + * dump requests). + */ +static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data, + const struct ethnl_request_ops *ops, + struct net_device *dev) +{ + memset(reply_data, 0, ops->reply_data_size); + reply_data->dev = dev; +} + +/* default ->doit() handler for GET type requests */ +static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_reply_data *reply_data = NULL; + struct ethnl_req_info *req_info = NULL; + const u8 cmd = info->genlhdr->cmd; + const struct ethnl_request_ops *ops; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ops = ethnl_default_requests[cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return -ENOMEM; + } + + ret = ethnl_default_parse(req_info, info->nlhdr, genl_info_net(info), ops, + info->extack, !ops->allow_nodev_do); + if (ret < 0) + goto err_dev; + ethnl_init_reply_data(reply_data, ops, req_info->dev); + + rtnl_lock(); + ret = ops->prepare_data(req_info, reply_data, info); + rtnl_unlock(); + if (ret < 0) + goto err_cleanup; + ret = ops->reply_size(req_info, reply_data); + if (ret < 0) + goto err_cleanup; + reply_len = ret; + ret = -ENOMEM; + rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, + ops->hdr_attr, info, &reply_payload); + if (!rskb) + goto err_cleanup; + ret = ops->fill_reply(rskb, req_info, reply_data); + if (ret < 0) + goto err_msg; + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + + genlmsg_end(rskb, reply_payload); + if (req_info->dev) + dev_put(req_info->dev); + kfree(reply_data); + kfree(req_info); + return genlmsg_reply(rskb, info); + +err_msg: + WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); + nlmsg_free(rskb); +err_cleanup: + if (ops->cleanup_data) + ops->cleanup_data(reply_data); +err_dev: + if (req_info->dev) + dev_put(req_info->dev); + kfree(reply_data); + kfree(req_info); + return ret; +} + +static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, + const struct ethnl_dump_ctx *ctx) +{ + int ret; + + ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); + rtnl_lock(); + ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, NULL); + rtnl_unlock(); + if (ret < 0) + goto out; + ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); + if (ret < 0) + goto out; + ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data); + +out: + if (ctx->ops->cleanup_data) + ctx->ops->cleanup_data(ctx->reply_data); + ctx->reply_data->dev = NULL; + return ret; +} + +/* Default ->dumpit() handler for GET requests. Device iteration copied from + * rtnl_dump_ifinfo(); we have to be more careful about device hashtable + * persistence as we cannot guarantee to hold RTNL lock through the whole + * function as rtnetnlink does. + */ +static int ethnl_default_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + unsigned int seq; + + head = &net->dev_index_head[h]; + +restart_chain: + seq = net->dev_base_seq; + cb->seq = seq; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + dev_hold(dev); + rtnl_unlock(); + + ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + ðtool_genl_family, 0, + ctx->ops->reply_cmd); + if (!ehdr) { + dev_put(dev); + ret = -EMSGSIZE; + goto out; + } + ret = ethnl_default_dump_one(skb, dev, ctx); + dev_put(dev); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto lock_and_cont; + if (likely(skb->len)) + ret = skb->len; + goto out; + } + genlmsg_end(skb, ehdr); +lock_and_cont: + rtnl_lock(); + if (net->dev_base_seq != seq) { + s_idx = idx + 1; + goto restart_chain; + } +cont: + idx++; + } + + } + rtnl_unlock(); + +out: + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + return ret; +} + +/* generic ->start() handler for GET requests */ +static int ethnl_default_start(struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct genlmsghdr *ghdr; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + ghdr = nlmsg_data(cb->nlh); + ops = ethnl_default_requests[ghdr->cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + ret = ethnl_default_parse(req_info, cb->nlh, sock_net(cb->skb->sk), ops, + cb->extack, false); + if (req_info->dev) { + /* We ignore device specification in dump requests but as the + * same parser as for non-dump (doit) requests is used, it + * would take reference to the device if it finds one + */ + dev_put(req_info->dev); + req_info->dev = NULL; + } + if (ret < 0) + goto free_reply_data; + + ctx->ops = ops; + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_hash = 0; + ctx->pos_idx = 0; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +/* default ->done() handler for GET requests */ +static int ethnl_default_done(struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + + kfree(ctx->reply_data); + kfree(ctx->req_info); + + return 0; +} + +static const struct ethnl_request_ops * +ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { + [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKMODES_NTF] = ðnl_linkmodes_request_ops, +}; + +/* default notification handler */ +static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, + const void *data) +{ + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct sk_buff *skb; + void *reply_payload; + int reply_len; + int ret; + + if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || + !ethnl_default_notify_ops[cmd], + "unexpected notification type %u\n", cmd)) + return; + ops = ethnl_default_notify_ops[cmd]; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return; + } + + req_info->dev = dev; + req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; + + ethnl_init_reply_data(reply_data, ops, dev); + ret = ops->prepare_data(req_info, reply_data, NULL); + if (ret < 0) + goto err_cleanup; + ret = ops->reply_size(req_info, reply_data); + if (ret < 0) + goto err_cleanup; + reply_len = ret; + ret = -ENOMEM; + skb = genlmsg_new(reply_len, GFP_KERNEL); + if (!skb) + goto err_cleanup; + reply_payload = ethnl_bcastmsg_put(skb, cmd); + if (!reply_payload) + goto err_skb; + ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr); + if (ret < 0) + goto err_msg; + ret = ops->fill_reply(skb, req_info, reply_data); + if (ret < 0) + goto err_msg; + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + + genlmsg_end(skb, reply_payload); + kfree(reply_data); + kfree(req_info); + ethnl_multicast(skb, dev); + return; + +err_msg: + WARN_ONCE(ret == -EMSGSIZE, + "calculated message payload length (%d) not sufficient\n", + reply_len); +err_skb: + nlmsg_free(skb); +err_cleanup: + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + kfree(reply_data); + kfree(req_info); + return; +} + +/* notifications */ + +typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, + const void *data); + +static const ethnl_notify_handler_t ethnl_notify_handlers[] = { + [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, +}; + +void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) +{ + if (unlikely(!ethnl_ok)) + return; + ASSERT_RTNL(); + + if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && + ethnl_notify_handlers[cmd])) + ethnl_notify_handlers[cmd](dev, cmd, data); + else + WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", + cmd, netdev_name(dev)); +} +EXPORT_SYMBOL(ethtool_notify); + +/* genetlink setup */ + +static const struct genl_ops ethtool_genl_ops[] = { + { + .cmd = ETHTOOL_MSG_STRSET_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_LINKINFO_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_LINKINFO_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_linkinfo, + }, + { + .cmd = ETHTOOL_MSG_LINKMODES_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_LINKMODES_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_linkmodes, + }, + { + .cmd = ETHTOOL_MSG_LINKSTATE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, +}; + +static const struct genl_multicast_group ethtool_nl_mcgrps[] = { + [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME }, +}; + +static struct genl_family ethtool_genl_family = { + .name = ETHTOOL_GENL_NAME, + .version = ETHTOOL_GENL_VERSION, + .netnsok = true, + .parallel_ops = true, + .ops = ethtool_genl_ops, + .n_ops = ARRAY_SIZE(ethtool_genl_ops), + .mcgrps = ethtool_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps), +}; + +/* module setup */ + +static int __init ethnl_init(void) +{ + int ret; + + ret = genl_register_family(ðtool_genl_family); + if (WARN(ret < 0, "ethtool: genetlink family registration failed")) + return ret; + ethnl_ok = true; + + return 0; +} + +subsys_initcall(ethnl_init); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h new file mode 100644 index 000000000000..da9d6521a4eb --- /dev/null +++ b/net/ethtool/netlink.h @@ -0,0 +1,341 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_NETLINK_H +#define _NET_ETHTOOL_NETLINK_H + +#include <linux/ethtool_netlink.h> +#include <linux/netdevice.h> +#include <net/genetlink.h> +#include <net/sock.h> + +struct ethnl_req_info; + +int ethnl_parse_header(struct ethnl_req_info *req_info, + const struct nlattr *nest, struct net *net, + struct netlink_ext_ack *extack, bool require_dev); +int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, + u16 attrtype); +struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, + u16 hdr_attrtype, struct genl_info *info, + void **ehdrp); + +/** + * ethnl_strz_size() - calculate attribute length for fixed size string + * @s: ETH_GSTRING_LEN sized string (may not be null terminated) + * + * Return: total length of an attribute with null terminated string from @s + */ +static inline int ethnl_strz_size(const char *s) +{ + return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); +} + +/** + * ethnl_put_strz() - put string attribute with fixed size string + * @skb: skb with the message + * @attrype: attribute type + * @s: ETH_GSTRING_LEN sized string (may not be null terminated) + * + * Puts an attribute with null terminated string from @s into the message. + * + * Return: 0 on success, negative error code on failure + */ +static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, + const char *s) +{ + unsigned int len = strnlen(s, ETH_GSTRING_LEN); + struct nlattr *attr; + + attr = nla_reserve(skb, attrtype, len + 1); + if (!attr) + return -EMSGSIZE; + + memcpy(nla_data(attr), s, len); + ((char *)nla_data(attr))[len] = '\0'; + return 0; +} + +/** + * ethnl_update_u32() - update u32 value from NLA_U32 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Copy the u32 value from NLA_U32 netlink attribute @attr into variable + * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod + * is set to true if this function changed the value of *dst, otherwise it + * is left as is. + */ +static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + u32 val; + + if (!attr) + return; + val = nla_get_u32(attr); + if (*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_u8() - update u8 value from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Copy the u8 value from NLA_U8 netlink attribute @attr into variable + * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod + * is set to true if this function changed the value of *dst, otherwise it + * is left as is. + */ +static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = nla_get_u8(attr); + if (*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable + * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is + * null. Bool pointed to by @mod is set to true if this function changed the + * logical value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = !!nla_get_u8(attr); + if (!!*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_binary() - update binary data from NLA_BINARY atribute + * @dst: value to update + * @len: destination buffer length + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block + * of length @len at @dst by attribute payload; do nothing if @attr is null. + * Bool pointed to by @mod is set to true if this function changed the logical + * value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_binary(void *dst, unsigned int len, + const struct nlattr *attr, bool *mod) +{ + if (!attr) + return; + if (nla_len(attr) < len) + len = nla_len(attr); + if (!memcmp(dst, nla_data(attr), len)) + return; + + memcpy(dst, nla_data(attr), len); + *mod = true; +} + +/** + * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Update bits in u32 value which are set in attribute's mask to values from + * attribute's value. Do nothing if @attr is null or the value wouldn't change; + * otherwise, set bool pointed to by @mod to true. + */ +static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + struct nla_bitfield32 change; + u32 newval; + + if (!attr) + return; + change = nla_get_bitfield32(attr); + newval = (*dst & ~change.selector) | (change.value & change.selector); + if (*dst == newval) + return; + + *dst = newval; + *mod = true; +} + +/** + * ethnl_reply_header_size() - total size of reply header + * + * This is an upper estimate so that we do not need to hold RTNL lock longer + * than necessary (to prevent rename between size estimate and composing the + * message). Accounts only for device ifindex and name as those are the only + * attributes ethnl_fill_reply_header() puts into the reply header. + */ +static inline unsigned int ethnl_reply_header_size(void) +{ + return nla_total_size(nla_total_size(sizeof(u32)) + + nla_total_size(IFNAMSIZ)); +} + +/* GET request handling */ + +/* Unified processing of GET requests uses two data structures: request info + * and reply data. Request info holds information parsed from client request + * and its stays constant through all request processing. Reply data holds data + * retrieved from ethtool_ops callbacks or other internal sources which is used + * to compose the reply. When processing a dump request, request info is filled + * only once (when the request message is parsed) but reply data is filled for + * each reply message. + * + * Both structures consist of part common for all request types (struct + * ethnl_req_info and struct ethnl_reply_data defined below) and optional + * parts specific for each request type. Common part always starts at offset 0. + */ + +/** + * struct ethnl_req_info - base type of request information for GET requests + * @dev: network device the request is for (may be null) + * @flags: request flags common for all request types + * + * This is a common base for request specific structures holding data from + * parsed userspace request. These always embed struct ethnl_req_info at + * zero offset. + */ +struct ethnl_req_info { + struct net_device *dev; + u32 flags; +}; + +/** + * struct ethnl_reply_data - base type of reply data for GET requests + * @dev: device for current reply message; in single shot requests it is + * equal to ðnl_req_info.dev; in dumps it's different for each + * reply message + * + * This is a common base for request specific structures holding data for + * kernel reply message. These always embed struct ethnl_reply_data at zero + * offset. + */ +struct ethnl_reply_data { + struct net_device *dev; +}; + +static inline int ethnl_ops_begin(struct net_device *dev) +{ + if (dev && dev->ethtool_ops->begin) + return dev->ethtool_ops->begin(dev); + else + return 0; +} + +static inline void ethnl_ops_complete(struct net_device *dev) +{ + if (dev && dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); +} + +/** + * struct ethnl_request_ops - unified handling of GET requests + * @request_cmd: command id for request (GET) + * @reply_cmd: command id for reply (GET_REPLY) + * @hdr_attr: attribute type for request header + * @max_attr: maximum (top level) attribute type + * @req_info_size: size of request info + * @reply_data_size: size of reply data + * @request_policy: netlink policy for message contents + * @allow_nodev_do: allow non-dump request with no device identification + * @parse_request: + * Parse request except common header (struct ethnl_req_info). Common + * header is already filled on entry, the rest up to @repdata_offset + * is zero initialized. This callback should only modify type specific + * request info by parsed attributes from request message. + * @prepare_data: + * Retrieve and prepare data needed to compose a reply message. Calls to + * ethtool_ops handlers are limited to this callback. Common reply data + * (struct ethnl_reply_data) is filled on entry, type specific part after + * it is zero initialized. This callback should only modify the type + * specific part of reply data. Device identification from struct + * ethnl_reply_data is to be used as for dump requests, it iterates + * through network devices while dev member of struct ethnl_req_info + * points to the device from client request. + * @reply_size: + * Estimate reply message size. Returned value must be sufficient for + * message payload without common reply header. The callback may returned + * estimate higher than actual message size if exact calculation would + * not be worth the saved memory space. + * @fill_reply: + * Fill reply message payload (except for common header) from reply data. + * The callback must not generate more payload than previously called + * ->reply_size() estimated. + * @cleanup_data: + * Optional cleanup called when reply data is no longer needed. Can be + * used e.g. to free any additional data structures outside the main + * structure which were allocated by ->prepare_data(). When processing + * dump requests, ->cleanup() is called for each message. + * + * Description of variable parts of GET request handling when using the + * unified infrastructure. When used, a pointer to an instance of this + * structure is to be added to ðnl_default_requests array and generic + * handlers ethnl_default_doit(), ethnl_default_dumpit(), + * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; + * ethnl_default_notify() can be used in @ethnl_notify_handlers to send + * notifications of the corresponding type. + */ +struct ethnl_request_ops { + u8 request_cmd; + u8 reply_cmd; + u16 hdr_attr; + unsigned int max_attr; + unsigned int req_info_size; + unsigned int reply_data_size; + const struct nla_policy *request_policy; + bool allow_nodev_do; + + int (*parse_request)(struct ethnl_req_info *req_info, + struct nlattr **tb, + struct netlink_ext_ack *extack); + int (*prepare_data)(const struct ethnl_req_info *req_info, + struct ethnl_reply_data *reply_data, + struct genl_info *info); + int (*reply_size)(const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data); + int (*fill_reply)(struct sk_buff *skb, + const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data); + void (*cleanup_data)(struct ethnl_reply_data *reply_data); +}; + +/* request handlers */ + +extern const struct ethnl_request_ops ethnl_strset_request_ops; +extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; +extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; +extern const struct ethnl_request_ops ethnl_linkstate_request_ops; + +int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); + +#endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c new file mode 100644 index 000000000000..82a059c13c1c --- /dev/null +++ b/net/ethtool/strset.c @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> +#include <linux/phy.h> +#include "netlink.h" +#include "common.h" + +struct strset_info { + bool per_dev; + bool free_strings; + unsigned int count; + const char (*strings)[ETH_GSTRING_LEN]; +}; + +static const struct strset_info info_template[] = { + [ETH_SS_TEST] = { + .per_dev = true, + }, + [ETH_SS_STATS] = { + .per_dev = true, + }, + [ETH_SS_PRIV_FLAGS] = { + .per_dev = true, + }, + [ETH_SS_FEATURES] = { + .per_dev = false, + .count = ARRAY_SIZE(netdev_features_strings), + .strings = netdev_features_strings, + }, + [ETH_SS_RSS_HASH_FUNCS] = { + .per_dev = false, + .count = ARRAY_SIZE(rss_hash_func_strings), + .strings = rss_hash_func_strings, + }, + [ETH_SS_TUNABLES] = { + .per_dev = false, + .count = ARRAY_SIZE(tunable_strings), + .strings = tunable_strings, + }, + [ETH_SS_PHY_STATS] = { + .per_dev = true, + }, + [ETH_SS_PHY_TUNABLES] = { + .per_dev = false, + .count = ARRAY_SIZE(phy_tunable_strings), + .strings = phy_tunable_strings, + }, + [ETH_SS_LINK_MODES] = { + .per_dev = false, + .count = __ETHTOOL_LINK_MODE_MASK_NBITS, + .strings = link_mode_names, + }, +}; + +struct strset_req_info { + struct ethnl_req_info base; + u32 req_ids; + bool counts_only; +}; + +#define STRSET_REQINFO(__req_base) \ + container_of(__req_base, struct strset_req_info, base) + +struct strset_reply_data { + struct ethnl_reply_data base; + struct strset_info sets[ETH_SS_COUNT]; +}; + +#define STRSET_REPDATA(__reply_base) \ + container_of(__reply_base, struct strset_reply_data, base) + +static const struct nla_policy strset_get_policy[ETHTOOL_A_STRSET_MAX + 1] = { + [ETHTOOL_A_STRSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRSET_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +get_stringset_policy[ETHTOOL_A_STRINGSET_MAX + 1] = { + [ETHTOOL_A_STRINGSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 }, + [ETHTOOL_A_STRINGSET_COUNT] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSET_STRINGS] = { .type = NLA_REJECT }, +}; + +/** + * strset_include() - test if a string set should be included in reply + * @data: pointer to request data structure + * @id: id of string set to check (ETH_SS_* constants) + */ +static bool strset_include(const struct strset_req_info *info, + const struct strset_reply_data *data, u32 id) +{ + bool per_dev; + + BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids)); + + if (info->req_ids) + return info->req_ids & (1U << id); + per_dev = data->sets[id].per_dev; + if (!per_dev && !data->sets[id].strings) + return false; + + return data->base.dev ? per_dev : !per_dev; +} + +static int strset_get_id(const struct nlattr *nest, u32 *val, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ETHTOOL_A_STRINGSET_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_STRINGSET_MAX, nest, + get_stringset_policy, extack); + if (ret < 0) + return ret; + if (!tb[ETHTOOL_A_STRINGSET_ID]) + return -EINVAL; + + *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]); + return 0; +} + +static const struct nla_policy +strset_stringsets_policy[ETHTOOL_A_STRINGSETS_MAX + 1] = { + [ETHTOOL_A_STRINGSETS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED }, +}; + +static int strset_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct strset_req_info *req_info = STRSET_REQINFO(req_base); + struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS]; + struct nlattr *attr; + int rem, ret; + + if (!nest) + return 0; + ret = nla_validate_nested(nest, ETHTOOL_A_STRINGSETS_MAX, + strset_stringsets_policy, extack); + if (ret < 0) + return ret; + + req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY]; + nla_for_each_nested(attr, nest, rem) { + u32 id; + + if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET, + "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n", + nla_type(attr))) + return -EINVAL; + + ret = strset_get_id(attr, &id, extack); + if (ret < 0) + return ret; + if (ret >= ETH_SS_COUNT) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "unknown string set id"); + return -EOPNOTSUPP; + } + + req_info->req_ids |= (1U << id); + } + + return 0; +} + +static void strset_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct strset_reply_data *data = STRSET_REPDATA(reply_base); + unsigned int i; + + for (i = 0; i < ETH_SS_COUNT; i++) + if (data->sets[i].free_strings) { + kfree(data->sets[i].strings); + data->sets[i].strings = NULL; + data->sets[i].free_strings = false; + } +} + +static int strset_prepare_set(struct strset_info *info, struct net_device *dev, + unsigned int id, bool counts_only) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + void *strings; + int count, ret; + + if (id == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + ret = phy_ethtool_get_sset_count(dev->phydev); + else if (ops->get_sset_count && ops->get_strings) + ret = ops->get_sset_count(dev, id); + else + ret = -EOPNOTSUPP; + if (ret <= 0) { + info->count = 0; + return 0; + } + + count = ret; + if (!counts_only) { + strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL); + if (!strings) + return -ENOMEM; + if (id == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + phy_ethtool_get_strings(dev->phydev, strings); + else + ops->get_strings(dev, id, strings); + info->strings = strings; + info->free_strings = true; + } + info->count = count; + + return 0; +} + +static int strset_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + struct strset_reply_data *data = STRSET_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + unsigned int i; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT); + memcpy(&data->sets, &info_template, sizeof(data->sets)); + + if (!dev) { + for (i = 0; i < ETH_SS_COUNT; i++) { + if ((req_info->req_ids & (1U << i)) && + data->sets[i].per_dev) { + if (info) + GENL_SET_ERR_MSG(info, "requested per device strings without dev"); + return -EINVAL; + } + } + return 0; + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto err_strset; + for (i = 0; i < ETH_SS_COUNT; i++) { + if (!strset_include(req_info, data, i) || + !data->sets[i].per_dev) + continue; + + ret = strset_prepare_set(&data->sets[i], dev, i, + req_info->counts_only); + if (ret < 0) + goto err_ops; + } + ethnl_ops_complete(dev); + + return 0; +err_ops: + ethnl_ops_complete(dev); +err_strset: + strset_cleanup_data(reply_base); + return ret; +} + +/* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */ +static int strset_set_size(const struct strset_info *info, bool counts_only) +{ + unsigned int len = 0; + unsigned int i; + + if (info->count == 0) + return 0; + if (counts_only) + return nla_total_size(2 * nla_total_size(sizeof(u32))); + + for (i = 0; i < info->count; i++) { + const char *str = info->strings[i]; + + /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */ + len += nla_total_size(nla_total_size(sizeof(u32)) + + ethnl_strz_size(str)); + } + /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */ + len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len); + + return nla_total_size(len); +} + +static int strset_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + const struct strset_reply_data *data = STRSET_REPDATA(reply_base); + unsigned int i; + int len = 0; + int ret; + + len += ethnl_reply_header_size(); + for (i = 0; i < ETH_SS_COUNT; i++) { + const struct strset_info *set_info = &data->sets[i]; + + if (!strset_include(req_info, data, i)) + continue; + + ret = strset_set_size(set_info, req_info->counts_only); + if (ret < 0) + return ret; + len += ret; + } + + return len; +} + +/* fill one string into reply */ +static int strset_fill_string(struct sk_buff *skb, + const struct strset_info *set_info, u32 idx) +{ + struct nlattr *string_attr; + const char *value; + + value = set_info->strings[idx]; + + string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING); + if (!string_attr) + return -EMSGSIZE; + if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) || + ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value)) + goto nla_put_failure; + nla_nest_end(skb, string_attr); + + return 0; +nla_put_failure: + nla_nest_cancel(skb, string_attr); + return -EMSGSIZE; +} + +/* fill one string set into reply */ +static int strset_fill_set(struct sk_buff *skb, + const struct strset_info *set_info, u32 id, + bool counts_only) +{ + struct nlattr *stringset_attr; + struct nlattr *strings_attr; + unsigned int i; + + if (!set_info->per_dev && !set_info->strings) + return -EOPNOTSUPP; + if (set_info->count == 0) + return 0; + stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET); + if (!stringset_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) || + nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count)) + goto nla_put_failure; + + if (!counts_only) { + strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS); + if (!strings_attr) + goto nla_put_failure; + for (i = 0; i < set_info->count; i++) { + if (strset_fill_string(skb, set_info, i) < 0) + goto nla_put_failure; + } + nla_nest_end(skb, strings_attr); + } + + nla_nest_end(skb, stringset_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, stringset_attr); + return -EMSGSIZE; +} + +static int strset_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + const struct strset_reply_data *data = STRSET_REPDATA(reply_base); + struct nlattr *nest; + unsigned int i; + int ret; + + nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS); + if (!nest) + return -EMSGSIZE; + + for (i = 0; i < ETH_SS_COUNT; i++) { + if (strset_include(req_info, data, i)) { + ret = strset_fill_set(skb, &data->sets[i], i, + req_info->counts_only); + if (ret < 0) + goto nla_put_failure; + } + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return ret; +} + +const struct ethnl_request_ops ethnl_strset_request_ops = { + .request_cmd = ETHTOOL_MSG_STRSET_GET, + .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY, + .hdr_attr = ETHTOOL_A_STRSET_HEADER, + .max_attr = ETHTOOL_A_STRSET_MAX, + .req_info_size = sizeof(struct strset_req_info), + .reply_data_size = sizeof(struct strset_reply_data), + .request_policy = strset_get_policy, + .allow_nodev_do = true, + + .parse_request = strset_parse_request, + .prepare_data = strset_prepare_data, + .reply_size = strset_reply_size, + .fill_reply = strset_fill_reply, + .cleanup_data = strset_cleanup_data, +}; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index d40de84a637f..754d84b217f0 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -191,7 +191,7 @@ void hsr_debugfs_term(struct hsr_priv *priv); void hsr_debugfs_create_root(void); void hsr_debugfs_remove_root(void); #else -static inline void void hsr_debugfs_rename(struct net_device *dev) +static inline void hsr_debugfs_rename(struct net_device *dev) { } static inline void hsr_debugfs_init(struct hsr_priv *priv, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b9df9c09b84e..b92a42433a7d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -980,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t, /* Return the first fib alias matching TOS with * priority less than or equal to PRIO. + * If 'find_first' is set, return the first matching + * fib alias, regardless of TOS and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, - u8 tos, u32 prio, u32 tb_id) + u8 tos, u32 prio, u32 tb_id, + bool find_first) { struct fib_alias *fa; @@ -998,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, continue; if (fa->tb_id != tb_id) break; + if (find_first) + return fa; if (fa->fa_tos > tos) continue; if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) @@ -1063,9 +1068,6 @@ noleaf: return -ENOMEM; } -/* fib notifier for ADD is sent before calling fib_insert_alias with - * the expectation that the only possible failure ENOMEM - */ static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) @@ -1118,11 +1120,13 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) return true; } +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old); + /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; @@ -1149,7 +1153,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, - tb->tb_id) : NULL; + tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,tos,priority], if such key already @@ -1217,12 +1221,17 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - err = call_fib_entry_notifiers(net, - FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa, - extack); - if (err) - goto out_free_new_fa; + if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, + tb->tb_id, true) == fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, + key, plen, + new_fa, extack); + if (err) + goto out_free_new_fa; + } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1244,12 +1253,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (fa_match) goto out; - if (cfg->fc_nlflags & NLM_F_APPEND) { - event = FIB_EVENT_ENTRY_APPEND; + if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; - } else { + else fa = fa_first; - } } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) @@ -1269,14 +1276,26 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); - if (err) - goto out_free_new_fa; - /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_fib_notif; + goto out_free_new_fa; + + /* The alias was already inserted, so the node must exist. */ + l = l ? l : fib_find_node(t, &tp, key); + if (WARN_ON_ONCE(!l)) + goto out_free_new_fa; + + if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == + new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, key, plen, + new_fa, extack); + if (err) + goto out_remove_new_fa; + } if (!plen) tb->tb_num_default++; @@ -1287,14 +1306,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, succeeded: return 0; -out_fib_notif: - /* notifier was sent that entry would be added to trie, but - * the add failed and need to recover. Only failure for - * fib_insert_alias is ENOMEM. - */ - NL_SET_ERR_MSG(extack, "Failed to insert route into trie"); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, - plen, new_fa, NULL); +out_remove_new_fa: + fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1545,6 +1558,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, node_pull_suffix(tp, fa->fa_slen); } +static void fib_notify_alias_delete(struct net *net, u32 key, + struct hlist_head *fah, + struct fib_alias *fa_to_delete, + struct netlink_ext_ack *extack) +{ + struct fib_alias *fa_next, *fa_to_notify; + u32 tb_id = fa_to_delete->tb_id; + u8 slen = fa_to_delete->fa_slen; + enum fib_event_type fib_event; + + /* Do not notify if we do not care about the route. */ + if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) + return; + + /* Determine if the route should be replaced by the next route in the + * list. + */ + fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, + struct fib_alias, fa_list); + if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { + fib_event = FIB_EVENT_ENTRY_REPLACE; + fa_to_notify = fa_next; + } else { + fib_event = FIB_EVENT_ENTRY_DEL; + fa_to_notify = fa_to_delete; + } + call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, + fa_to_notify, extack); +} + /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1566,7 +1609,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!l) return -ESRCH; - fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); + fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false); if (!fa) return -ESRCH; @@ -1598,8 +1641,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!fa_to_delete) return -ESRCH; - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete, extack); + fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1923,10 +1965,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) continue; } - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, - n->key, - KEYLENGTH - fa->fa_slen, fa, - NULL); + fib_notify_alias_delete(net, n->key, &n->leaf, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -2022,6 +2062,7 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct netlink_ext_ack *extack) { struct fib_alias *fa; + int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { @@ -2036,8 +2077,12 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, if (tb->tb_id != fa->tb_id) continue; - err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, + if (fa->fa_slen == last_slen) + continue; + + last_slen = fa->fa_slen; + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, + l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 4de7e962d3da..2e6d1b7a7bc9 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head, if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; - skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, + skb_gro_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index fcb2cd167f64..9684af02e0a5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1193,6 +1193,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "tcp_no_ssthresh_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d885ba868822..f09fbc85b108 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -443,8 +443,6 @@ void tcp_init_sock(struct sock *sk) tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; - sk->sk_state = TCP_CLOSE; - sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -1778,6 +1776,8 @@ static int tcp_zerocopy_receive(struct sock *sk, while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { if (skb) { + if (zc->recv_skip_hint > 0) + break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 1b3d032a4df2..8f8eefd3a3ce 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -40,8 +40,8 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (4U<<3) -#define HYSTART_DELAY_MAX (16U<<3) +#define HYSTART_DELAY_MIN (4000U) /* 4 ms */ +#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) static int fast_convergence __read_mostly = 1; @@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; -static int hystart_ack_delta __read_mostly = 2; +static int hystart_ack_delta_us __read_mostly = 2000; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); -module_param(hystart_ack_delta, int, 0644); -MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); +module_param(hystart_ack_delta_us, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -89,7 +89,7 @@ struct bictcp { u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay (msec << 3) */ + u32 delay_min; /* min delay (usec) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } -static inline u32 bictcp_clock(void) +static inline u32 bictcp_clock_us(const struct sock *sk) { -#if HZ < 1000 - return ktime_to_ms(ktime_get_real()); -#else - return jiffies_to_msecs(jiffies); -#endif + return tcp_sk(sk)->tcp_mstamp; } static inline void bictcp_hystart_reset(struct sock *sk) @@ -131,9 +127,9 @@ static inline void bictcp_hystart_reset(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_ack = bictcp_clock(); + ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; - ca->curr_rtt = 0; + ca->curr_rtt = ~0U; ca->sample_cnt = 0; } @@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) */ t = (s32)(tcp_jiffies32 - ca->epoch_start); - t += msecs_to_jiffies(ca->delay_min >> 3); + t += usecs_to_jiffies(ca->delay_min); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; do_div(t, HZ); @@ -376,22 +372,54 @@ static void bictcp_state(struct sock *sk, u8 new_state) } } +/* Account for TSO/GRO delays. + * Otherwise short RTT flows could get too small ssthresh, since during + * slow start we begin with small TSO packets and ca->delay_min would + * not account for long aggregation delay when TSO packets get bigger. + * Ideally even with a very small RTT we would like to have at least one + * TSO packet being sent and received by GRO, and another one in qdisc layer. + * We apply another 100% factor because @rate is doubled at this point. + * We cap the cushion to 1ms. + */ +static u32 hystart_ack_delay(struct sock *sk) +{ + unsigned long rate; + + rate = READ_ONCE(sk->sk_pacing_rate); + if (!rate) + return 0; + return min_t(u64, USEC_PER_MSEC, + div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); +} + static void hystart_update(struct sock *sk, u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - - if (ca->found & hystart_detect) - return; + u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now = bictcp_clock(); + u32 now = bictcp_clock_us(sk); /* first detection parameter - ack-train detection */ - if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { - ca->found |= HYSTART_ACK_TRAIN; + + threshold = ca->delay_min + hystart_ack_delay(sk); + + /* Hystart ack train triggers if we get ack past + * ca->delay_min/2. + * Pacing might have delayed packets up to RTT/2 + * during slow start. + */ + if (sk->sk_pacing_status == SK_PACING_NONE) + threshold >>= 1; + + if ((s32)(now - ca->round_start) > threshold) { + ca->found = 1; + pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", + now - ca->round_start, threshold, + ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), @@ -405,14 +433,14 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { - if (ca->curr_rtt == 0 || ca->curr_rtt > delay) + if (ca->curr_rtt > delay) ca->curr_rtt = delay; ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { - ca->found |= HYSTART_DELAY; + ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), @@ -424,9 +452,6 @@ static void hystart_update(struct sock *sk, u32 delay) } } -/* Track delayed acknowledgment ratio using sliding window - * ratio = (15*ratio + sample) / 16 - */ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); @@ -441,7 +466,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; - delay = (sample->rtt_us << 3) / USEC_PER_MSEC; + delay = sample->rtt_us; if (delay == 0) delay = 1; @@ -450,7 +475,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tcp_in_slow_start(tp) && + if (!ca->found && tcp_in_slow_start(tp) && hystart && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0238b554a1f0..062b696a5fa1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3553,7 +3553,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; - if (unlikely(rexmit == 2)) { + if (unlikely(rexmit == REXMIT_NEW)) { __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); if (after(tp->snd_nxt, tp->high_seq)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1c7326e04f9b..4adac9c75343 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -701,9 +701,21 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + const union tcp_md5_addr *addr; + int l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and inet_iif is set to it. + */ + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } else if (hash_location) { + const union tcp_md5_addr *addr; + int sdif = tcp_v4_sdif(skb); + int dif = inet_iif(skb); + int l3index; + /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -714,14 +726,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb), - tcp_v4_sdif(skb)); + ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ if (!sk1) goto out; - key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to it. + */ + l3index = sdif ? dif : 0; + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); if (!key) goto out; @@ -905,6 +920,9 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { + const union tcp_md5_addr *addr; + int l3index; + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ @@ -916,14 +934,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, - AF_INET), + tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); } @@ -983,7 +1002,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); /* Find the Key structure for an address. */ -struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { @@ -1003,7 +1022,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; - + if (key->l3index && key->l3index != l3index) + continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); match = (key->addr.a4.s_addr & mask) == @@ -1027,7 +1047,8 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen) + int family, u8 prefixlen, + int l3index) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1046,6 +1067,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; + if (key->l3index && key->l3index != l3index) + continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) return key; @@ -1057,23 +1080,26 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; + int l3index; + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; - return tcp_md5_do_lookup(sk, addr, AF_INET); + return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, - gfp_t gfp) + int family, u8 prefixlen, int l3index, + const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); @@ -1105,6 +1131,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key->keylen = newkeylen; key->family = family; key->prefixlen = prefixlen; + key->l3index = l3index; memcpy(&key->addr, addr, (family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -1114,11 +1141,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, - u8 prefixlen) + u8 prefixlen, int l3index) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1149,7 +1176,9 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + const union tcp_md5_addr *addr; u8 prefixlen = 32; + int l3index = 0; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1167,16 +1196,34 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); + if (dev && netif_is_l3_master(dev)) + l3index = dev->ifindex; + + rcu_read_unlock(); + + /* ok to reference set/not set outside of rcu; + * right now device MUST be an L3 master + */ + if (!dev || !l3index) + return -EINVAL; + } + + addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; + if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, - GFP_KERNEL); + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1286,7 +1333,8 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb); /* Called with rcu_read_lock() */ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + int dif, int sdif) { #ifdef CONFIG_TCP_MD5SIG /* @@ -1301,11 +1349,17 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - int genhash; + const union tcp_md5_addr *addr; unsigned char newhash[16]; + int genhash, l3index; - hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, - AF_INET); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + addr = (union tcp_md5_addr *)&iph->saddr; + hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -1331,11 +1385,11 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", + net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), genhash ? " tcp_v4_calc_md5_hash failed" - : ""); + : "", l3index); return true; } return false; @@ -1419,7 +1473,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG + const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; + int l3index; #endif struct ip_options_rcu *inet_opt; @@ -1467,9 +1523,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); /* Copy over the MD5 key from the original socket */ - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET); + addr = (union tcp_md5_addr *)&newinet->inet_daddr; + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key) { /* * We're using one, so create a matching key @@ -1477,8 +1534,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, + key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif @@ -1808,6 +1865,7 @@ int tcp_v4_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); + int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1856,7 +1914,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1914,7 +1972,7 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v4_inbound_md5_hash(sk, skb)) + if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; nf_reset_ct(skb); @@ -2675,6 +2733,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c4848e7a0aad..279db8822439 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk) if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tp->snd_cwnd >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk) } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { @@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ @@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk) if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); - val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ? + 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a3908e55ed89..b25e42100ceb 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -480,7 +480,7 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) inet_gro_compute_pseudo)) goto flush; else if (uh->check) - skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7bae6a91b487..b1e9a10e1133 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -370,6 +370,21 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, return call_fib6_notifier(nb, event_type, &info.info); } +static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + .nsiblings = nsiblings, + }; + + return call_fib6_notifier(nb, event_type, &info.info); +} + int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, @@ -400,6 +415,17 @@ int call_fib6_multipath_entry_notifiers(struct net *net, return call_fib6_notifiers(net, event_type, &info.info); } +int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + .nsiblings = rt->fib6_nsiblings, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); +} + struct fib6_dump_arg { struct net *net; struct notifier_block *nb; @@ -408,22 +434,29 @@ struct fib6_dump_arg { static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.fib6_null_entry) + enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; + int err; + + if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; - return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, - rt, arg->extack); + + if (rt->fib6_nsiblings) + err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, + rt, + rt->fib6_nsiblings, + arg->extack); + else + err = call_fib6_entry_notifier(arg->nb, fib_event, rt, + arg->extack); + + return err; } static int fib6_node_dump(struct fib6_walker *w) { - struct fib6_info *rt; - int err = 0; + int err; - for_each_fib6_walker_rt(w) { - err = fib6_rt_dump(rt, w->args); - if (err) - break; - } + err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } @@ -1039,6 +1072,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; @@ -1130,6 +1164,7 @@ next_iter: /* Find the first route that have the same metric */ sibling = leaf; + notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { @@ -1139,6 +1174,7 @@ next_iter: } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); + notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings @@ -1165,10 +1201,21 @@ next_iter: add: nlflags |= NLM_F_CREATE; - if (!info->skip_notify_kernel) { + /* The route should only be notified if it is the first + * route in the node or if it is added as a sibling + * route to the first route in the node. + */ + if (!info->skip_notify_kernel && + (notify_sibling_rt || ins == &fn->leaf)) { + enum fib_event_type fib_event; + + if (notify_sibling_rt) + fib_event = FIB_EVENT_ENTRY_APPEND; + else + fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); + fib_event, rt, + extack); if (err) { struct fib6_info *sibling, *next_sibling; @@ -1212,7 +1259,7 @@ add: return -ENOENT; } - if (!info->skip_notify_kernel) { + if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1845,13 +1892,29 @@ static struct fib6_node *fib6_repair_tree(struct net *net, static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { + struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; + bool notify_del = false; RT6_TRACE("fib6_del_route\n"); + /* If the deleted route is the first in the node and it is not part of + * a multipath route, then we need to replace it with the next route + * in the node, if exists. + */ + leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (leaf == rt && !rt->fib6_nsiblings) { + if (rcu_access_pointer(rt->fib6_next)) + replace_rt = rcu_dereference_protected(rt->fib6_next, + lockdep_is_held(&table->tb6_lock)); + else + notify_del = true; + } + /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; @@ -1869,6 +1932,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; + /* The route is deleted from a multipath route. If this + * multipath route is the first route in the node, then we need + * to emit a delete notification. Otherwise, we need to skip + * the notification. + */ + if (rt->fib6_metric == leaf->fib6_metric && + rt6_qualify_for_ecmp(leaf)) + notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; @@ -1904,8 +1975,13 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - if (!info->skip_notify_kernel) - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + if (!info->skip_notify_kernel) { + if (notify_del) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, + rt, NULL); + else if (replace_rt) + call_fib6_entry_notifiers_replace(net, replace_rt); + } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index affb51c11a25..0253b702afb7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3757,6 +3757,7 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; + struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); @@ -3772,12 +3773,32 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + /* 'rt' points to the first sibling route. If it is not the + * leaf, then we do not need to send a notification. Otherwise, + * we need to check if the last sibling has a next route or not + * and emit a replace or delete notification, respectively. + */ info->skip_notify_kernel = 1; - call_fib6_multipath_entry_notifiers(net, - FIB_EVENT_ENTRY_DEL, - rt, - rt->fib6_nsiblings, - NULL); + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&table->tb6_lock)); + if (rcu_access_pointer(fn->leaf) == rt) { + struct fib6_info *last_sibling, *replace_rt; + + last_sibling = list_last_entry(&rt->fib6_siblings, + struct fib6_info, + fib6_siblings); + replace_rt = rcu_dereference_protected( + last_sibling->fib6_next, + lockdep_is_held(&table->tb6_lock)); + if (replace_rt) + call_fib6_entry_notifiers_replace(net, + replace_rt); + else + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL, + rt, rt->fib6_nsiblings, + NULL); + } list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -5025,12 +5046,37 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); } +static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) +{ + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + bool should_notify = false; + struct fib6_info *leaf; + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + + leaf = rcu_dereference(fn->leaf); + if (!leaf) + goto out; + + if (rt == leaf || + (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && + rt6_qualify_for_ecmp(leaf))) + should_notify = true; +out: + rcu_read_unlock(); + + return should_notify; +} + static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; - enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -5155,13 +5201,27 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } - event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; - err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, - rt_notif, nhn - 1, extack); - if (err) { - /* Delete all the siblings that were just added */ - err_nh = NULL; - goto add_errout; + /* An in-kernel notification should only be sent in case the new + * multipath route is added as the first route in the node, or if + * it was appended to it. We pass 'rt_notif' since it is the first + * sibling and might allow us to skip some checks in the replace case. + */ + if (ip6_route_mpath_should_notify(rt_notif)) { + enum fib_event_type fib_event; + + if (rt_notif->fib6_nsiblings != nhn - 1) + fib_event = FIB_EVENT_ENTRY_APPEND; + else + fib_event = FIB_EVENT_ENTRY_REPLACE; + + err = call_fib6_multipath_entry_notifiers(info->nl_net, + fib_event, rt_notif, + nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } } /* success ... tell user about new route */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index df5fd9109696..95e4e1e95db2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -81,7 +81,8 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #else static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr) + const struct in6_addr *addr, + int l3index) { return NULL; } @@ -532,15 +533,22 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr) + const struct in6_addr *addr, + int l3index) { - return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); + return tcp_md5_do_lookup(sk, l3index, + (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { - return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); + int l3index; + + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); + return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, + l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, @@ -548,6 +556,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; + int l3index = 0; u8 prefixlen; if (optlen < sizeof(cmd)) @@ -569,12 +578,30 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); + if (dev && netif_is_l3_master(dev)) + l3index = dev->ifindex; + rcu_read_unlock(); + + /* ok to reference set/not set outside of rcu; + * right now device MUST be an L3 master + */ + if (!dev || !l3index) + return -EINVAL; + } + if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen); + AF_INET, prefixlen, + l3index); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen); + AF_INET6, prefixlen, l3index); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) @@ -582,12 +609,13 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen, cmd.tcpm_key, - cmd.tcpm_keylen, GFP_KERNEL); + AF_INET, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, + GFP_KERNEL); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, cmd.tcpm_key, - cmd.tcpm_keylen, GFP_KERNEL); + AF_INET6, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -698,17 +726,23 @@ clear_hash_noput: #endif static bool tcp_v6_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + int dif, int sdif) { #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - int genhash; + int genhash, l3index; u8 newhash[16]; - hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr, l3index); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -732,10 +766,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", + net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", genhash ? "failed" : "mismatch", &ip6h->saddr, ntohs(th->source), - &ip6h->daddr, ntohs(th->dest)); + &ip6h->daddr, ntohs(th->dest), l3index); return true; } #endif @@ -951,8 +985,18 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { - key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr); + int l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and inet_iif is set to it. + */ + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); } else if (hash_location) { + int dif = tcp_v6_iif_l3_slave(skb); + int sdif = tcp_v6_sdif(skb); + int l3index; + /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -964,13 +1008,16 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), - tcp_v6_iif_l3_slave(skb), - tcp_v6_sdif(skb)); + ntohs(th->source), dif, sdif); if (!sk1) goto out; - key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to it. + */ + l3index = tcp_v6_sdif(skb) ? dif : 0; + + key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key) goto out; @@ -1040,6 +1087,10 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { + int l3index; + + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ @@ -1054,7 +1105,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), 0, 0, sk->sk_priority); } @@ -1126,6 +1177,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; + int l3index; #endif struct flowi6 fl6; @@ -1269,8 +1321,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); + /* Copy over the MD5 key from the original socket */ - key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr); + key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { /* We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1278,7 +1332,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, key->key, key->keylen, + AF_INET6, 128, l3index, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } #endif @@ -1480,6 +1534,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { struct sk_buff *skb_to_free; int sdif = inet6_sdif(skb); + int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; @@ -1528,7 +1583,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (tcp_v6_inbound_md5_hash(sk, skb)) { + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1583,7 +1638,7 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v6_inbound_md5_hash(sk, skb)) + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; if (tcp_filter(sk, skb)) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 64b8f05d6735..f0d5fc27d0b5 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -127,7 +127,7 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo)) goto flush; else if (uh->check) - skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, ip6_gro_compute_pseudo); skip: diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index f82ea12bac37..c99223cb3338 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -122,8 +122,6 @@ static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) static inline struct l2tp_net *l2tp_pernet(const struct net *net) { - BUG_ON(!net); - return net_generic(net, l2tp_net_id); } diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 0187e65176c0..ba9ae482141b 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -369,7 +369,15 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) eh = skb_push(nr->cmd, sizeof(*eh)); eh->h_proto = htons(ETH_P_NCSI); eth_broadcast_addr(eh->h_dest); - eth_broadcast_addr(eh->h_source); + + /* If mac address received from device then use it for + * source address as unicast address else use broadcast + * address as source address + */ + if (nca->ndp->gma_flag == 1) + memcpy(eh->h_source, nca->ndp->ndev.dev->dev_addr, ETH_ALEN); + else + eth_broadcast_addr(eh->h_source); /* Start the timer for the request that might not have * corresponding response. Given NCSI is an internal diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 70fe02697544..e20b81514029 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -764,9 +764,6 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) return -1; } - /* Set the flag for GMA command which should only be called once */ - nca->ndp->gma_flag = 1; - /* Get Mac address from NCSI device */ return nch->handler(nca); } diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index d5611f04926d..a94bb59793f0 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -627,6 +627,9 @@ static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); @@ -671,6 +674,9 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) return -ENXIO; + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f475fec84536..f4c4b467c87e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2334,7 +2334,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) return nf_conntrack_hash_resize(hashsize); } -EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); static __always_inline unsigned int total_extension_size(void) { diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index c24e5b64b00c..3dbe2329c3f1 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -37,7 +37,6 @@ void nf_ct_ext_destroy(struct nf_conn *ct) kfree(ct->ext); } -EXPORT_SYMBOL(nf_ct_ext_destroy); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 9740b554fdb3..951b6e87ed5d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -17,6 +17,7 @@ #include <linux/smp.h> #include <linux/static_key.h> #include <net/dst.h> +#include <net/ip.h> #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> @@ -33,8 +34,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -static u8 nft_meta_weekday(time64_t secs) +static u8 nft_meta_weekday(void) { + time64_t secs = ktime_get_real_seconds(); unsigned int dse; u8 wday; @@ -56,14 +58,264 @@ static u32 nft_meta_hour(time64_t secs) + tm.tm_sec; } +static noinline_for_stack void +nft_meta_get_eval_time(enum nft_meta_keys key, + u32 *dest) +{ + switch (key) { + case NFT_META_TIME_NS: + nft_reg_store64(dest, ktime_get_real_ns()); + break; + case NFT_META_TIME_DAY: + nft_reg_store8(dest, nft_meta_weekday()); + break; + case NFT_META_TIME_HOUR: + *dest = nft_meta_hour(ktime_get_real_seconds()); + break; + default: + break; + } +} + +static noinline bool +nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt, + u32 *dest) +{ + const struct sk_buff *skb = pkt->skb; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + nft_reg_store8(dest, PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + break; + case NFPROTO_IPV6: + nft_reg_store8(dest, PACKET_MULTICAST); + break; + case NFPROTO_NETDEV: + switch (skb->protocol) { + case htons(ETH_P_IP): { + int noff = skb_network_offset(skb); + struct iphdr *iph, _iph; + + iph = skb_header_pointer(skb, noff, + sizeof(_iph), &_iph); + if (!iph) + return false; + + if (ipv4_is_multicast(iph->daddr)) + nft_reg_store8(dest, PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + + break; + } + case htons(ETH_P_IPV6): + nft_reg_store8(dest, PACKET_MULTICAST); + break; + default: + WARN_ON_ONCE(1); + return false; + } + break; + default: + WARN_ON_ONCE(1); + return false; + } + + return true; +} + +static noinline bool +nft_meta_get_eval_skugid(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + struct socket *sock; + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + read_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + if (!sock || !sock->file) { + read_unlock_bh(&sk->sk_callback_lock); + return false; + } + + switch (key) { + case NFT_META_SKUID: + *dest = from_kuid_munged(&init_user_ns, + sock->file->f_cred->fsuid); + break; + case NFT_META_SKGID: + *dest = from_kgid_munged(&init_user_ns, + sock->file->f_cred->fsgid); + break; + default: + break; + } + + read_unlock_bh(&sk->sk_callback_lock); + return true; +} + +#ifdef CONFIG_CGROUP_NET_CLASSID +static noinline bool +nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); + return true; +} +#endif + +static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); + + switch (key) { + case NFT_META_IIFKIND: + if (!in || !in->rtnl_link_ops) + return false; + strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_OIFKIND: + if (!out || !out->rtnl_link_ops) + return false; + strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; + default: + return false; + } + + return true; +} + +static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) +{ + *dest = dev ? dev->ifindex : 0; +} + +static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) +{ + strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); +} + +static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + nft_reg_store16(dest, dev->type); + return true; +} + +static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + *dest = dev->group; + return true; +} + +static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, + const struct nft_pktinfo *pkt) +{ + switch (key) { + case NFT_META_IIFNAME: + nft_meta_store_ifname(dest, nft_in(pkt)); + break; + case NFT_META_OIFNAME: + nft_meta_store_ifname(dest, nft_out(pkt)); + break; + case NFT_META_IIF: + nft_meta_store_ifindex(dest, nft_in(pkt)); + break; + case NFT_META_OIF: + nft_meta_store_ifindex(dest, nft_out(pkt)); + break; + case NFT_META_IIFTYPE: + if (!nft_meta_store_iftype(dest, nft_in(pkt))) + return false; + break; + case NFT_META_OIFTYPE: + if (!nft_meta_store_iftype(dest, nft_out(pkt))) + return false; + break; + case NFT_META_IIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + case NFT_META_OIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + default: + return false; + } + + return true; +} + +static noinline u32 nft_prandom_u32(void) +{ + struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); + + return prandom_u32_state(state); +} + +#ifdef CONFIG_IP_ROUTE_CLASSID +static noinline bool +nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) +{ + const struct dst_entry *dst = skb_dst(skb); + + if (!dst) + return false; + + *dest = dst->tclassid; + return true; +} +#endif + +static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return inet_sdif(pkt->skb); + case NFPROTO_IPV6: + return inet6_sdif(pkt->skb); + } + + return 0; +} + +static noinline void +nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt) +{ + u32 sdif = nft_meta_get_eval_sdif(pkt); + const struct net_device *dev; + + dev = sdif ? dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL; + nft_meta_store_ifname(dest, dev); +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - struct sock *sk; u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { @@ -88,69 +340,26 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = skb->mark; break; case NFT_META_IIF: - *dest = in ? in->ifindex : 0; - break; case NFT_META_OIF: - *dest = out ? out->ifindex : 0; - break; case NFT_META_IIFNAME: - strncpy((char *)dest, in ? in->name : "", IFNAMSIZ); - break; case NFT_META_OIFNAME: - strncpy((char *)dest, out ? out->name : "", IFNAMSIZ); - break; case NFT_META_IIFTYPE: - if (in == NULL) - goto err; - nft_reg_store16(dest, in->type); - break; case NFT_META_OIFTYPE: - if (out == NULL) + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: + if (!nft_meta_get_eval_ifname(priv->key, dest, pkt)) goto err; - nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) - goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); - goto err; - } - - *dest = from_kuid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsuid); - read_unlock_bh(&sk->sk_callback_lock); - break; case NFT_META_SKGID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) + if (!nft_meta_get_eval_skugid(priv->key, dest, pkt)) goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); - goto err; - } - *dest = from_kgid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&sk->sk_callback_lock); break; #ifdef CONFIG_IP_ROUTE_CLASSID - case NFT_META_RTCLASSID: { - const struct dst_entry *dst = skb_dst(skb); - - if (dst == NULL) + case NFT_META_RTCLASSID: + if (!nft_meta_get_eval_rtclassid(skb, dest)) goto err; - *dest = dst->tclassid; break; - } #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: @@ -163,97 +372,41 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; } - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - break; - case NFPROTO_IPV6: - nft_reg_store8(dest, PACKET_MULTICAST); - break; - case NFPROTO_NETDEV: - switch (skb->protocol) { - case htons(ETH_P_IP): { - int noff = skb_network_offset(skb); - struct iphdr *iph, _iph; - - iph = skb_header_pointer(skb, noff, - sizeof(_iph), &_iph); - if (!iph) - goto err; - - if (ipv4_is_multicast(iph->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - - break; - } - case htons(ETH_P_IPV6): - nft_reg_store8(dest, PACKET_MULTICAST); - break; - default: - WARN_ON_ONCE(1); - goto err; - } - break; - default: - WARN_ON_ONCE(1); + if (!nft_meta_get_eval_pkttype_lo(pkt, dest)) goto err; - } break; case NFT_META_CPU: *dest = raw_smp_processor_id(); break; - case NFT_META_IIFGROUP: - if (in == NULL) - goto err; - *dest = in->group; - break; - case NFT_META_OIFGROUP: - if (out == NULL) - goto err; - *dest = out->group; - break; #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) + if (!nft_meta_get_eval_cgroup(dest, pkt)) goto err; - *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif - case NFT_META_PRANDOM: { - struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); - *dest = prandom_u32_state(state); + case NFT_META_PRANDOM: + *dest = nft_prandom_u32(); break; - } #ifdef CONFIG_XFRM case NFT_META_SECPATH: nft_reg_store8(dest, secpath_exists(skb)); break; #endif case NFT_META_IIFKIND: - if (in == NULL || in->rtnl_link_ops == NULL) - goto err; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); - break; case NFT_META_OIFKIND: - if (out == NULL || out->rtnl_link_ops == NULL) + if (!nft_meta_get_eval_kind(priv->key, dest, pkt)) goto err; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); - break; case NFT_META_TIME_DAY: - nft_reg_store8(dest, nft_meta_weekday(ktime_get_real_seconds())); - break; case NFT_META_TIME_HOUR: - *dest = nft_meta_hour(ktime_get_real_seconds()); + nft_meta_get_eval_time(priv->key, dest); + break; + case NFT_META_SDIF: + *dest = nft_meta_get_eval_sdif(pkt); + break; + case NFT_META_SDIFNAME: + nft_meta_get_eval_sdifname(dest, pkt); break; default: WARN_ON(1); @@ -335,6 +488,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_MARK: case NFT_META_IIF: case NFT_META_OIF: + case NFT_META_SDIF: case NFT_META_SKUID: case NFT_META_SKGID: #ifdef CONFIG_IP_ROUTE_CLASSID @@ -356,6 +510,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_OIFNAME: case NFT_META_IIFKIND: case NFT_META_OIFKIND: + case NFT_META_SDIFNAME: len = IFNAMSIZ; break; case NFT_META_PRANDOM: @@ -386,16 +541,28 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_get_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx) { -#ifdef CONFIG_XFRM - const struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; - if (priv->key != NFT_META_SECPATH) - return 0; + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + +static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) +{ +#ifdef CONFIG_XFRM + unsigned int hooks; switch (ctx->family) { case NFPROTO_NETDEV: @@ -418,6 +585,25 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } +static int nft_meta_get_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + switch (priv->key) { + case NFT_META_SECPATH: + return nft_meta_get_validate_xfrm(ctx); + case NFT_META_SDIF: + case NFT_META_SDIFNAME: + return nft_meta_get_validate_sdif(ctx); + default: + break; + } + + return 0; +} + int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 3d4c2ae605a8..23cd163689d5 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -248,8 +248,9 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, } static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = { + [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 }, [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 }, - [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, + [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 }, }; @@ -442,10 +443,15 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 || - nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 || - nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label)) + if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, + &info->key.u.ipv6.src) < 0 || + nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, + &info->key.u.ipv6.dst) < 0 || + nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, + info->key.label)) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } else { @@ -453,9 +459,13 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 || - nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0) + if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, + info->key.u.ipv4.src) < 0 || + nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, + info->key.u.ipv4.dst) < 0) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } @@ -467,42 +477,58 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct nft_tunnel_obj *priv) { struct nft_tunnel_opts *opts = &priv->opts; - struct nlattr *nest; + struct nlattr *nest, *inner; nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS); if (!nest) return -1; if (opts->flags & TUNNEL_VXLAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); + if (!inner) + goto failure; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP, htonl(opts->u.vxlan.gbp))) - return -1; + goto inner_failure; + nla_nest_end(skb, inner); } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); + if (!inner) + goto failure; + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION, + htonl(opts->u.erspan.version))) + goto inner_failure; switch (opts->u.erspan.version) { case ERSPAN_VERSION: if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, opts->u.erspan.u.index)) - return -1; + goto inner_failure; break; case ERSPAN_VERSION2: if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, get_hwid(&opts->u.erspan.u.md2)) || nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, opts->u.erspan.u.md2.dir)) - return -1; + goto inner_failure; break; } + nla_nest_end(skb, inner); } nla_nest_end(skb, nest); - return 0; + +inner_failure: + nla_nest_cancel(skb, inner); +failure: + nla_nest_cancel(skb, nest); + return -1; } static int nft_tunnel_ports_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { - if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 || - nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0) + if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 || + nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0) return -1; return 0; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 4c8395462303..7fbfe2adfffa 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -161,16 +161,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct nlattr *attr, int len); static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_action_push_mpls *mpls) + __be32 mpls_lse, __be16 mpls_ethertype, __u16 mac_len) { int err; - err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype, - skb->mac_len, - ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); + err = skb_mpls_push(skb, mpls_lse, mpls_ethertype, mac_len, !!mac_len); if (err) return err; + if (!mac_len) + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); return 0; } @@ -185,6 +186,9 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (err) return err; + if (ethertype == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + invalidate_flow_key(key); return 0; } @@ -1229,10 +1233,24 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; - case OVS_ACTION_ATTR_PUSH_MPLS: - err = push_mpls(skb, key, nla_data(a)); + case OVS_ACTION_ATTR_PUSH_MPLS: { + struct ovs_action_push_mpls *mpls = nla_data(a); + + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, skb->mac_len); break; + } + case OVS_ACTION_ATTR_ADD_MPLS: { + struct ovs_action_add_mpls *mpls = nla_data(a); + __u16 mac_len = 0; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) + mac_len = skb->mac_len; + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, mac_len); + break; + } case OVS_ACTION_ATTR_POP_MPLS: err = pop_mpls(skb, key, nla_get_be16(a)); break; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 65c2e3458ff5..7da4230627f5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -79,6 +79,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_SET_MASKED: case OVS_ACTION_ATTR_METER: case OVS_ACTION_ATTR_CHECK_PKT_LEN: + case OVS_ACTION_ATTR_ADD_MPLS: default: return true; } @@ -3005,6 +3006,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_METER] = sizeof(u32), [OVS_ACTION_ATTR_CLONE] = (u32)-1, [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, + [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3072,6 +3074,33 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_ADD_MPLS: { + const struct ovs_action_add_mpls *mpls = nla_data(a); + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) { + if (vlan_tci & htons(VLAN_CFI_MASK) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + mpls_label_count++; + } else { + if (mac_proto == MAC_PROTO_ETHERNET) { + mpls_label_count = 1; + mac_proto = MAC_PROTO_NONE; + } else { + mpls_label_count++; + } + } + eth_type = mpls->mpls_ethertype; + break; + } + case OVS_ACTION_ATTR_PUSH_MPLS: { const struct ovs_action_push_mpls *mpls = nla_data(a); @@ -3109,6 +3138,11 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, * recirculation. */ proto = nla_get_be16(a); + + if (proto == htons(ETH_P_TEB) && + mac_proto != MAC_PROTO_NONE) + return -EINVAL; + mpls_label_count--; if (!eth_p_mpls(proto) || !mpls_label_count) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 118cd66b7516..3bec515ccde3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -520,7 +520,7 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; - unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; + unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; @@ -532,31 +532,25 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); - if (!err) { - /* - * If the link speed is so slow you don't really - * need to worry about perf anyways - */ - if (ecmd.base.speed < SPEED_1000 || - ecmd.base.speed == SPEED_UNKNOWN) { - return DEFAULT_PRB_RETIRE_TOV; - } else { - msec = 1; - div = ecmd.base.speed / 1000; - } - } else + if (err) return DEFAULT_PRB_RETIRE_TOV; + /* If the link speed is so slow you don't really + * need to worry about perf anyways + */ + if (ecmd.base.speed < SPEED_1000 || + ecmd.base.speed == SPEED_UNKNOWN) + return DEFAULT_PRB_RETIRE_TOV; + + div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; - tmo = mbits * msec; - if (div) - return tmo+1; - return tmo; + return mbits + 1; + return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 49bd76a87461..ac0fae06cc15 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -35,8 +35,6 @@ static unsigned int phonet_net_id __read_mostly; static struct phonet_net *phonet_pernet(struct net *net) { - BUG_ON(!net); - return net_generic(net, phonet_net_id); } diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index c53307623236..5277631fa14c 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -696,7 +696,6 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; - failed = 0; goto out; } failed = 1; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2985509147a2..b1e7ec726958 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -409,6 +409,23 @@ config NET_SCH_PLUG To compile this code as a module, choose M here: the module will be called sch_plug. +config NET_SCH_ETS + tristate "Enhanced transmission selection scheduler (ETS)" + help + The Enhanced Transmission Selection scheduler is a classful + queuing discipline that merges functionality of PRIO and DRR + qdiscs in one scheduler. ETS makes it easy to configure a set of + strict and bandwidth-sharing bands to implement the transmission + selection described in 802.1Qaz. + + Say Y here if you want to use the ETS packet scheduling + algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_ets. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 415d1e1f237e..bc8856b865ff 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o +obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 2277369feae5..90ef7cc79b69 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -173,8 +173,7 @@ struct cake_tin_data { u64 tin_rate_bps; u16 tin_rate_shft; - u16 tin_quantum_prio; - u16 tin_quantum_band; + u16 tin_quantum; s32 tin_deficit; u32 tin_backlog; u32 tin_dropped; @@ -1919,7 +1918,7 @@ begin: while (b->tin_deficit < 0 || !(b->sparse_flow_count + b->bulk_flow_count)) { if (b->tin_deficit <= 0) - b->tin_deficit += b->tin_quantum_band; + b->tin_deficit += b->tin_quantum; if (b->sparse_flow_count + b->bulk_flow_count) empty = false; @@ -2241,8 +2240,7 @@ static int cake_config_besteffort(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_band = 65535; - b->tin_quantum_prio = 65535; + b->tin_quantum = 65535; return 0; } @@ -2253,8 +2251,7 @@ static int cake_config_precedence(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2267,18 +2264,14 @@ static int cake_config_precedence(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2347,8 +2340,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2364,18 +2356,14 @@ static int cake_config_diffserv8(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2414,17 +2402,11 @@ static int cake_config_diffserv4(struct Qdisc *sch) cake_set_rate(&q->tins[3], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 2; - q->tins[3].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 1; - q->tins[3].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 1; + q->tins[3].tin_quantum = quantum >> 2; return 0; } @@ -2455,15 +2437,10 @@ static int cake_config_diffserv3(struct Qdisc *sch) cake_set_rate(&q->tins[2], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 2; return 0; } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c new file mode 100644 index 000000000000..a87e9159338c --- /dev/null +++ b/net/sched/sch_ets.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * net/sched/sch_ets.c Enhanced Transmission Selection scheduler + * + * Description + * ----------- + * + * The Enhanced Transmission Selection scheduler is a classful queuing + * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler. + * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to + * implement the transmission selection described in 802.1Qaz. + * + * Although ETS is technically classful, it's not possible to add and remove + * classes at will. Instead one specifies number of classes, how many are + * PRIO-like and how many DRR-like, and quanta for the latter. + * + * Algorithm + * --------- + * + * The strict classes, if any, are tried for traffic first: first band 0, if it + * has no traffic then band 1, etc. + * + * When there is no traffic in any of the strict queues, the bandwidth-sharing + * ones are tried next. Each band is assigned a deficit counter, initialized to + * "quantum" of that band. ETS maintains a list of active bandwidth-sharing + * bands whose qdiscs are non-empty. A packet is dequeued from the band at the + * head of the list if the packet size is smaller or equal to the deficit + * counter. If the counter is too small, it is increased by "quantum" and the + * scheduler moves on to the next band in the active list. + */ + +#include <linux/module.h> +#include <net/gen_stats.h> +#include <net/netlink.h> +#include <net/pkt_cls.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> + +struct ets_class { + struct list_head alist; /* In struct ets_sched.active. */ + struct Qdisc *qdisc; + u32 quantum; + u32 deficit; + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; +}; + +struct ets_sched { + struct list_head active; + struct tcf_proto __rcu *filter_list; + struct tcf_block *block; + unsigned int nbands; + unsigned int nstrict; + u8 prio2band[TC_PRIO_MAX + 1]; + struct ets_class classes[TCQ_ETS_MAX_BANDS]; +}; + +static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_NBANDS] = { .type = NLA_U8 }, + [TCA_ETS_NSTRICT] = { .type = NLA_U8 }, + [TCA_ETS_QUANTA] = { .type = NLA_NESTED }, + [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 }, +}; + +static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, + unsigned int *quantum, + struct netlink_ext_ack *extack) +{ + *quantum = nla_get_u32(attr); + if (!*quantum) { + NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero"); + return -EINVAL; + } + return 0; +} + +static struct ets_class * +ets_class_from_arg(struct Qdisc *sch, unsigned long arg) +{ + struct ets_sched *q = qdisc_priv(sch); + + return &q->classes[arg - 1]; +} + +static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl) +{ + struct ets_sched *q = qdisc_priv(sch); + int band = cl - q->classes; + + return TC_H_MAKE(sch->handle, band + 1); +} + +static void ets_offload_change(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct ets_sched *q = qdisc_priv(sch); + struct tc_ets_qopt_offload qopt; + unsigned int w_psum_prev = 0; + unsigned int q_psum = 0; + unsigned int q_sum = 0; + unsigned int quantum; + unsigned int w_psum; + unsigned int weight; + unsigned int i; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.bands = q->nbands; + qopt.replace_params.qstats = &sch->qstats; + memcpy(&qopt.replace_params.priomap, + q->prio2band, sizeof(q->prio2band)); + + for (i = 0; i < q->nbands; i++) + q_sum += q->classes[i].quantum; + + for (i = 0; i < q->nbands; i++) { + quantum = q->classes[i].quantum; + q_psum += quantum; + w_psum = quantum ? q_psum * 100 / q_sum : 0; + weight = w_psum - w_psum_prev; + w_psum_prev = w_psum; + + qopt.replace_params.quanta[i] = quantum; + qopt.replace_params.weights[i] = weight; + } + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc *old, unsigned long arg, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_GRAFT; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.graft_params.band = arg - 1; + qopt.graft_params.child_handle = new->handle; + + qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS, + &qopt, extack); +} + +static int ets_offload_dump(struct Qdisc *sch) +{ + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt); +} + +static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl) +{ + unsigned int band = cl - q->classes; + + return band < q->nstrict; +} + +static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, *arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int quantum; + int err; + + /* Classes can be added and removed only through Qdisc_ops.change + * interface. + */ + if (!cl) { + NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported"); + return -EOPNOTSUPP; + } + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_QUANTA_BAND]) + /* Nothing to configure. */ + return 0; + + if (ets_class_is_strict(q, cl)) { + NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum"); + return -EINVAL; + } + + err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum, + extack); + if (err) + return err; + + sch_tree_lock(sch); + cl->quantum = quantum; + sch_tree_unlock(sch); + + ets_offload_change(sch); + return 0; +} + +static int ets_class_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, cl), NULL); + if (!new) + new = &noop_qdisc; + else + qdisc_hash_add(new, true); + } + + *old = qdisc_replace(sch, new, &cl->qdisc); + ets_offload_graft(sch, new, *old, arg, extack); + return 0; +} + +static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + return cl->qdisc; +} + +static unsigned long ets_class_find(struct Qdisc *sch, u32 classid) +{ + unsigned long band = TC_H_MIN(classid); + struct ets_sched *q = qdisc_priv(sch); + + if (band - 1 >= q->nbands) + return 0; + return band; +} + +static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + + /* We get notified about zero-length child Qdiscs as well if they are + * offloaded. Those aren't on the active list though, so don't attempt + * to remove them. + */ + if (!ets_class_is_strict(q, cl) && sch->q.qlen) + list_del(&cl->alist); +} + +static int ets_class_dump(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = ets_class_id(sch, cl); + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + if (!ets_class_is_strict(q, cl)) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum)) + goto nla_put_failure; + } + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct Qdisc *cl_q = cl->qdisc; + + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + + return 0; +} + +static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct ets_sched *q = qdisc_priv(sch); + int i; + + if (arg->stop) + return; + + for (i = 0; i < q->nbands; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_block * +ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + + if (cl) { + NL_SET_ERR_MSG(extack, "ETS classid must be zero"); + return NULL; + } + + return q->block; +} + +static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return ets_class_find(sch, classid); +} + +static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ +} + +static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct ets_sched *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + struct tcf_proto *fl; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + if (TC_H_MAJ(skb->priority) != sch->handle) { + fl = rcu_dereference_bh(q->filter_list); + err = tcf_classify(skb, fl, &res, false); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return NULL; + } +#endif + if (!fl || err < 0) { + if (TC_H_MAJ(band)) + band = 0; + return &q->classes[q->prio2band[band & TC_PRIO_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band >= q->nbands) + return &q->classes[q->prio2band[0]]; + return &q->classes[band]; +} + +static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + unsigned int len = qdisc_pkt_len(skb); + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + int err = 0; + bool first; + + cl = ets_classify(skb, sch, &err); + if (!cl) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + first = !cl->qdisc->q.qlen; + err = qdisc_enqueue(skb, cl->qdisc, to_free); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + qdisc_qstats_drop(sch); + } + return err; + } + + if (first && !ets_class_is_strict(q, cl)) { + list_add_tail(&cl->alist, &q->active); + cl->deficit = cl->quantum; + } + + sch->qstats.backlog += len; + sch->q.qlen++; + return err; +} + +static struct sk_buff * +ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb) +{ + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + return skb; +} + +static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + struct sk_buff *skb; + unsigned int band; + unsigned int len; + + while (1) { + for (band = 0; band < q->nstrict; band++) { + cl = &q->classes[band]; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (skb) + return ets_qdisc_dequeue_skb(sch, skb); + } + + if (list_empty(&q->active)) + goto out; + + cl = list_first_entry(&q->active, struct ets_class, alist); + skb = cl->qdisc->ops->peek(cl->qdisc); + if (!skb) { + qdisc_warn_nonwc(__func__, cl->qdisc); + goto out; + } + + len = qdisc_pkt_len(skb); + if (len <= cl->deficit) { + cl->deficit -= len; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (unlikely(!skb)) + goto out; + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + return ets_qdisc_dequeue_skb(sch, skb); + } + + cl->deficit += cl->quantum; + list_move_tail(&cl->alist, &q->active); + } +out: + return NULL; +} + +static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr, + unsigned int nbands, u8 *priomap, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int prio = 0; + u8 band; + int rem; + int err; + + err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX, + ets_priomap_policy, NL_VALIDATE_STRICT, + extack); + if (err) + return err; + + nla_for_each_nested(attr, priomap_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_PRIOMAP_BAND: + if (prio > TC_PRIO_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap"); + return -EINVAL; + } + band = nla_get_u8(attr); + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap"); + return -EINVAL; + } + priomap[prio++] = band; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr, + unsigned int nbands, unsigned int nstrict, + unsigned int *quanta, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int band = nstrict; + int rem; + int err; + + err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX, + ets_quanta_policy, NL_VALIDATE_STRICT, + extack); + if (err < 0) + return err; + + nla_for_each_nested(attr, quanta_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_QUANTA_BAND: + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands"); + return -EINVAL; + } + err = ets_quantum_parse(sch, attr, &quanta[band++], + extack); + if (err) + return err; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0}; + struct Qdisc *queues[TCQ_ETS_MAX_BANDS]; + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int oldbands = q->nbands; + u8 priomap[TC_PRIO_MAX + 1]; + unsigned int nstrict = 0; + unsigned int nbands; + unsigned int i; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_NBANDS]) { + NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument"); + return -EINVAL; + } + nbands = nla_get_u8(tb[TCA_ETS_NBANDS]); + if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands"); + return -EINVAL; + } + /* Unless overridden, traffic goes to the last band. */ + memset(priomap, nbands - 1, sizeof(priomap)); + + if (tb[TCA_ETS_NSTRICT]) { + nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]); + if (nstrict > nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands"); + return -EINVAL; + } + } + + if (tb[TCA_ETS_PRIOMAP]) { + err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP], + nbands, priomap, extack); + if (err) + return err; + } + + if (tb[TCA_ETS_QUANTA]) { + err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA], + nbands, nstrict, quanta, extack); + if (err) + return err; + } + /* If there are more bands than strict + quanta provided, the remaining + * ones are ETS with quantum of MTU. Initialize the missing values here. + */ + for (i = nstrict; i < nbands; i++) { + if (!quanta[i]) + quanta[i] = psched_mtu(qdisc_dev(sch)); + } + + /* Before commit, make sure we can allocate all new qdiscs */ + for (i = oldbands; i < nbands; i++) { + queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, &q->classes[i]), + extack); + if (!queues[i]) { + while (i > oldbands) + qdisc_put(queues[--i]); + return -ENOMEM; + } + } + + sch_tree_lock(sch); + + q->nbands = nbands; + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); + + for (i = q->nbands; i < oldbands; i++) + qdisc_tree_flush_backlog(q->classes[i].qdisc); + + for (i = 0; i < q->nbands; i++) + q->classes[i].quantum = quanta[i]; + + for (i = oldbands; i < q->nbands; i++) { + q->classes[i].qdisc = queues[i]; + if (q->classes[i].qdisc != &noop_qdisc) + qdisc_hash_add(q->classes[i].qdisc, true); + } + + sch_tree_unlock(sch); + + ets_offload_change(sch); + for (i = q->nbands; i < oldbands; i++) { + qdisc_put(q->classes[i].qdisc); + memset(&q->classes[i], 0, sizeof(q->classes[i])); + } + return 0; +} + +static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + int err; + + if (!opt) + return -EINVAL; + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + INIT_LIST_HEAD(&q->active); + return ets_qdisc_change(sch, opt, extack); +} + +static void ets_qdisc_reset(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + for (band = q->nstrict; band < q->nbands; band++) { + if (q->classes[band].qdisc->q.qlen) + list_del(&q->classes[band].alist); + } + for (band = 0; band < q->nbands; band++) + qdisc_reset(q->classes[band].qdisc); + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + +static void ets_qdisc_destroy(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + ets_offload_destroy(sch); + tcf_block_put(q->block); + for (band = 0; band < q->nbands; band++) + qdisc_put(q->classes[band].qdisc); +} + +static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opts; + struct nlattr *nest; + int band; + int prio; + int err; + + err = ets_offload_dump(sch); + if (err) + return err; + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_err; + + if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) + goto nla_err; + + if (q->nstrict && + nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) + goto nla_err; + + if (q->nbands > q->nstrict) { + nest = nla_nest_start(skb, TCA_ETS_QUANTA); + if (!nest) + goto nla_err; + + for (band = q->nstrict; band < q->nbands; band++) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, + q->classes[band].quantum)) + goto nla_err; + } + + nla_nest_end(skb, nest); + } + + nest = nla_nest_start(skb, TCA_ETS_PRIOMAP); + if (!nest) + goto nla_err; + + for (prio = 0; prio <= TC_PRIO_MAX; prio++) { + if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) + goto nla_err; + } + + nla_nest_end(skb, nest); + + return nla_nest_end(skb, opts); + +nla_err: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static const struct Qdisc_class_ops ets_class_ops = { + .change = ets_class_change, + .graft = ets_class_graft, + .leaf = ets_class_leaf, + .find = ets_class_find, + .qlen_notify = ets_class_qlen_notify, + .dump = ets_class_dump, + .dump_stats = ets_class_dump_stats, + .walk = ets_qdisc_walk, + .tcf_block = ets_qdisc_tcf_block, + .bind_tcf = ets_qdisc_bind_tcf, + .unbind_tcf = ets_qdisc_unbind_tcf, +}; + +static struct Qdisc_ops ets_qdisc_ops __read_mostly = { + .cl_ops = &ets_class_ops, + .id = "ets", + .priv_size = sizeof(struct ets_sched), + .enqueue = ets_qdisc_enqueue, + .dequeue = ets_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .change = ets_qdisc_change, + .init = ets_qdisc_init, + .reset = ets_qdisc_reset, + .destroy = ets_qdisc_destroy, + .dump = ets_qdisc_dump, + .owner = THIS_MODULE, +}; + +static int __init ets_init(void) +{ + return register_qdisc(&ets_qdisc_ops); +} + +static void __exit ets_exit(void) +{ + unregister_qdisc(&ets_qdisc_ops); +} + +module_init(ets_init); +module_exit(ets_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5ab696efca95..6c9595f1048a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -441,7 +441,7 @@ static void dev_watchdog(struct timer_list *t) trace_net_dev_xmit_timeout(dev, i); WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", dev->name, netdev_drivername(dev), i); - dev->netdev_ops->ndo_tx_timeout(dev); + dev->netdev_ops->ndo_tx_timeout(dev, i); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + diff --git a/net/sctp/associola.c b/net/sctp/associola.c index bbd5004a5d09..437079a4883d 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -584,7 +584,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, const gfp_t gfp, const int peer_state) { - struct net *net = sock_net(asoc->base.sk); struct sctp_transport *peer; struct sctp_sock *sp; unsigned short port; @@ -614,7 +613,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return peer; } - peer = sctp_transport_new(net, addr, gfp); + peer = sctp_transport_new(asoc->base.net, addr, gfp); if (!peer) return NULL; @@ -974,7 +973,7 @@ static void sctp_assoc_bh_rcv(struct work_struct *work) struct sctp_association *asoc = container_of(work, struct sctp_association, base.inqueue.immediate); - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; union sctp_subtype subtype; struct sctp_endpoint *ep; struct sctp_chunk *chunk; @@ -1442,7 +1441,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Should we send a SACK to update our peer? */ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) { - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; + switch (asoc->state) { case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: @@ -1576,7 +1576,7 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, if (asoc->peer.ipv6_address) flags |= SCTP_ADDR6_PEERSUPP; - return sctp_bind_addr_copy(sock_net(asoc->base.sk), + return sctp_bind_addr_copy(asoc->base.net, &asoc->base.bind_addr, &asoc->ep->base.bind_addr, scope, gfp, flags); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index cc3ce5d80b08..ab6a997e222f 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -225,7 +225,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) - SCTP_INC_STATS(sock_net(asoc->base.sk), + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 3ccab7440c9e..48c9c2c7602f 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -244,7 +244,7 @@ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct sctp_endpoint *retval = NULL; if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) && - net_eq(sock_net(ep->base.sk), net)) { + net_eq(ep->base.net, net)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; @@ -292,8 +292,8 @@ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct sctp_sockaddr_entry *addr; + struct net *net = ep->base.net; struct sctp_bind_addr *bp; - struct net *net = sock_net(ep->base.sk); bp = &ep->base.bind_addr; /* This function is called with the socket lock held, @@ -384,7 +384,7 @@ normal: if (asoc && sctp_chunk_is_data(chunk)) asoc->peer.last_data_from = chunk->transport; else { - SCTP_INC_STATS(sock_net(ep->base.sk), SCTP_MIB_INCTRLCHUNKS); + SCTP_INC_STATS(ep->base.net, SCTP_MIB_INCTRLCHUNKS); if (asoc) asoc->stats.ictrlchunks++; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 4d2bcfc9d7f8..efaaefc3bb1c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -937,7 +937,7 @@ int sctp_hash_transport(struct sctp_transport *t) if (t->asoc->temp) return 0; - arg.net = sock_net(t->asoc->base.sk); + arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); @@ -1004,12 +1004,11 @@ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { - struct net *net = sock_net(ep->base.sk); struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, - .net = net, + .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; diff --git a/net/sctp/output.c b/net/sctp/output.c index dbda7e7927fd..1441eaf460bb 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -282,7 +282,7 @@ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, sctp_chunk_free(sack); goto out; } - SCTP_INC_STATS(sock_net(asoc->base.sk), + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 0dab62b67b9a..577e3bc4ee6f 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -36,6 +36,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> +#include <trace/events/sctp.h> /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -279,7 +280,7 @@ void sctp_outq_free(struct sctp_outq *q) /* Put a new chunk in an sctp_outq. */ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { - struct net *net = sock_net(q->asoc->base.sk); + struct net *net = q->asoc->base.net; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? @@ -533,7 +534,7 @@ void sctp_retransmit_mark(struct sctp_outq *q, void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason) { - struct net *net = sock_net(q->asoc->base.sk); + struct net *net = q->asoc->base.net; switch (reason) { case SCTP_RTXR_T3_RTX: @@ -1238,6 +1239,12 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; + /* SCTP path tracepoint for congestion control debugging. */ + if (trace_sctp_probe_path_enabled()) { + list_for_each_entry(transport, transport_list, transports) + trace_sctp_probe_path(transport, asoc); + } + sack_ctsn = ntohl(sack->cum_tsn_ack); gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); asoc->stats.gapcnt += gap_ack_blocks; @@ -1884,6 +1891,6 @@ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); - SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 48d63956a68c..09050c1d5517 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2307,7 +2307,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; @@ -2363,8 +2362,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * also give us an option to silently ignore the packet, which * is what we'll do here. */ - if (!net->sctp.addip_noauth && - (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { + if (!asoc->base.net->sctp.addip_noauth && + (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); @@ -2491,9 +2490,9 @@ static int sctp_process_param(struct sctp_association *asoc, const union sctp_addr *peer_addr, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; + struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 834e9f82afed..2bc29463e1dc 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -516,8 +516,6 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, struct sctp_transport *transport, int is_hb) { - struct net *net = sock_net(asoc->base.sk); - /* The check for association's overall error counter exceeding the * threshold is done in the state function. */ @@ -544,10 +542,10 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, * is SCTP_ACTIVE, then mark this transport as Partially Failed, * see SCTP Quick Failover Draft, section 5.1 */ - if (net->sctp.pf_enable && - (transport->state == SCTP_ACTIVE) && - (transport->error_count < transport->pathmaxrxt) && - (transport->error_count > transport->pf_retrans)) { + if (asoc->base.net->sctp.pf_enable && + transport->state == SCTP_ACTIVE && + transport->error_count < transport->pathmaxrxt && + transport->error_count > transport->pf_retrans) { sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_PF, @@ -798,10 +796,8 @@ static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds, int err = 0; if (sctp_outq_sack(&asoc->outqueue, chunk)) { - struct net *net = sock_net(asoc->base.sk); - /* There are no more TSNs awaiting SACK. */ - err = sctp_do_sm(net, SCTP_EVENT_T_OTHER, + err = sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), asoc->state, asoc->ep, asoc, NULL, GFP_ATOMIC); @@ -834,7 +830,7 @@ static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_association *new) { - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; struct sctp_chunk *abort; if (!sctp_assoc_update(asoc, new)) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4ab8208a2dd4..748e3b19ec1d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1320,7 +1320,7 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, struct sctp_chunk *init, struct sctp_cmd_seq *commands) { - struct net *net = sock_net(new_asoc->base.sk); + struct net *net = new_asoc->base.net; struct sctp_transport *new_addr; int ret = 1; @@ -3281,8 +3281,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, struct sctp_sackhdr *sackh; __u32 ctsn; - trace_sctp_probe(ep, asoc, chunk); - if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -3299,6 +3297,15 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, chunk->subh.sack_hdr = sackh; ctsn = ntohl(sackh->cum_tsn_ack); + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (TSN_lte(asoc->next_tsn, ctsn)) + return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands); + + trace_sctp_probe(ep, asoc, chunk); + /* i) If Cumulative TSN Ack is less than the Cumulative TSN * Ack Point, then drop the SACK. Since Cumulative TSN * Ack is monotonically increasing, a SACK whose @@ -3312,13 +3319,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, return SCTP_DISPOSITION_DISCARD; } - /* If Cumulative TSN Ack beyond the max tsn currently - * send, terminating the association and respond to the - * sender with an ABORT. - */ - if (!TSN_lt(ctsn, asoc->next_tsn)) - return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands); - /* Return this SACK for further processing. */ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0b485952a71c..1b56fc440606 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -436,8 +436,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) static int sctp_send_asconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct net *net = sock_net(asoc->base.sk); - int retval = 0; + int retval = 0; /* If there is an outstanding ASCONF chunk, queue it for later * transmission. @@ -449,7 +448,7 @@ static int sctp_send_asconf(struct sctp_association *asoc, /* Hold the chunk until an ASCONF_ACK is received. */ sctp_chunk_hold(chunk); - retval = sctp_primitive_ASCONF(net, asoc, chunk); + retval = sctp_primitive_ASCONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); else @@ -2428,9 +2427,8 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, int error; if (params->spp_flags & SPP_HB_DEMAND && trans) { - struct net *net = sock_net(trans->asoc->base.sk); - - error = sctp_primitive_REQUESTHEARTBEAT(net, trans->asoc, trans); + error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc->base.net, + trans->asoc, trans); if (error) return error; } @@ -5364,7 +5362,7 @@ struct sctp_transport *sctp_transport_get_next(struct net *net, if (!sctp_transport_hold(t)) continue; - if (net_eq(sock_net(t->asoc->base.sk), net) && + if (net_eq(t->asoc->base.net, net) && t->asoc->peer.primary_path == t) break; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index c1a100d2fed3..67f7e71f9129 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -222,10 +222,9 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct net *net = sock_net(asoc->base.sk); int retval = 0; - retval = sctp_primitive_RECONF(net, asoc, chunk); + retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 40c40be23fcb..6b13f737ebf2 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -241,9 +241,8 @@ out: if (!first_frag) return NULL; - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, - last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { @@ -326,7 +325,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { @@ -337,8 +336,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( goto out; found: - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), - &ulpq->reasm, + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -630,7 +628,7 @@ out: if (!first_frag) return NULL; - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { @@ -716,7 +714,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { @@ -727,8 +725,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( goto out; found: - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), - &ulpq->reasm_uo, + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -814,7 +811,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) return NULL; out: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { @@ -921,7 +918,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) return NULL; out: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { @@ -1159,7 +1156,7 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); - SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 3bbe1a58ec87..806af58f4375 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -334,7 +334,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp); if (tp->rttvar || tp->srtt) { - struct net *net = sock_net(tp->asoc->base.sk); + struct net *net = tp->asoc->base.net; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index b6536b7f14c0..1c6c640607c5 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -486,10 +486,9 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul cevent = sctp_skb2event(pd_first); pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, - pd_first, - pd_last); + pd_first, pd_last); if (retval) sctp_ulpq_set_pd(ulpq); } @@ -497,7 +496,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul done: return retval; found: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -563,8 +562,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) * further. */ done: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; @@ -664,8 +663,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) * further. */ done: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); return retval; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e419ff277e55..2249de5379ee 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -41,7 +41,7 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; -static atomic_t lgr_cnt; /* number of existing link groups */ +static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -1297,7 +1297,6 @@ static struct notifier_block smc_reboot_notifier = { int __init smc_core_init(void) { - atomic_set(&lgr_cnt, 0); return register_reboot_notifier(&smc_reboot_notifier); } diff --git a/net/socket.c b/net/socket.c index 50623218747f..b79a05de7c6e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -129,6 +129,18 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +#ifdef CONFIG_PROC_FS +static void sock_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct socket *sock = f->private_data; + + if (sock->ops->show_fdinfo) + sock->ops->show_fdinfo(m, sock); +} +#else +#define sock_show_fdinfo NULL +#endif + /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. @@ -150,6 +162,7 @@ static const struct file_operations socket_file_ops = { .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, + .show_fdinfo = sock_show_fdinfo, }; /* diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 656ebc79c64e..4c20be08b9c4 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -568,18 +568,18 @@ int tipc_bclink_reset_stats(struct net *net) return 0; } -static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) +static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; - if (limit < BCLINK_WIN_MIN) - limit = BCLINK_WIN_MIN; - if (limit > TIPC_MAX_LINK_WIN) + if (max_win < BCLINK_WIN_MIN) + max_win = BCLINK_WIN_MIN; + if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, limit); + tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); tipc_bcast_unlock(net); return 0; } @@ -689,6 +689,7 @@ int tipc_bcast_init(struct net *net) if (!tipc_link_bc_create(net, 0, 0, FB_MTU, BCLINK_WIN_DEFAULT, + BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d7ec26bd739d..34ca7b789eba 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -311,7 +311,8 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->identity = bearer_id; b->tolerance = m->tolerance; - b->window = m->window; + b->min_win = m->min_win; + b->max_win = m->max_win; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; @@ -796,7 +797,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) goto prop_msg_full; if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) @@ -1088,7 +1089,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; @@ -1142,7 +1143,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) goto prop_msg_full; if (media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) @@ -1275,7 +1276,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (m->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index d0c79cc6c0c2..bc0023119da2 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -119,7 +119,8 @@ struct tipc_media { char *raw); u32 priority; u32 tolerance; - u32 window; + u32 min_win; + u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; @@ -158,7 +159,8 @@ struct tipc_bearer { struct packet_type pt; struct rcu_head rcu; u32 priority; - u32 window; + u32 min_win; + u32 max_win; u32 tolerance; u32 domain; u32 identity; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index f69a2fde9f4a..8b0bb600602d 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -92,7 +92,8 @@ struct tipc_media eth_media_info = { .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index e8c16718e3fa..7aa9ff88458d 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -42,6 +42,8 @@ #include "core.h" #include "bearer.h" +#define TIPC_MAX_IB_LINK_WIN 500 + /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) @@ -94,7 +96,8 @@ struct tipc_media ib_media_info = { .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" diff --git a/net/tipc/link.c b/net/tipc/link.c index 24d4d10756d3..467c53a1fb5c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -164,7 +164,6 @@ struct tipc_link { struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; - u16 window; /* Reception */ u16 rcv_nxt; @@ -175,6 +174,12 @@ struct tipc_link { /* Congestion handling */ struct sk_buff_head wakeupq; + u16 window; + u16 min_win; + u16 ssthresh; + u16 max_win; + u16 cong_acks; + u16 checkpoint; /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; @@ -244,12 +249,13 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); -static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); +static int tipc_link_release_pkts(struct tipc_link *l, u16 to); +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap); static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq); - +static void tipc_link_update_cwin(struct tipc_link *l, int released, + bool retransmitted); /* * Simple non-static link routines (i.e. referenced outside this file) */ @@ -308,9 +314,14 @@ u32 tipc_link_id(struct tipc_link *l) return l->peer_bearer_id << 16 | l->bearer_id; } -int tipc_link_window(struct tipc_link *l) +int tipc_link_min_win(struct tipc_link *l) { - return l->window; + return l->min_win; +} + +int tipc_link_max_win(struct tipc_link *l) +{ + return l->max_win; } int tipc_link_prio(struct tipc_link *l) @@ -436,7 +447,8 @@ u32 tipc_link_state(struct tipc_link *l) * @net_plane: network plane (A,B,c..) this link belongs to * @mtu: mtu to be advertised by link * @priority: priority to be used by link - * @window: send window to be used by link + * @min_win: minimal send window to be used by link + * @max_win: maximal send window to be used by link * @session: session to be used by link * @ownnode: identity of own node * @peer: node id of peer node @@ -451,7 +463,7 @@ u32 tipc_link_state(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 self, + u32 min_win, u32 max_win, u32 session, u32 self, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -495,7 +507,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->advertised_mtu = mtu; l->mtu = mtu; l->priority = priority; - tipc_link_set_queue_limits(l, window); + tipc_link_set_queue_limits(l, min_win, max_win); l->ackers = 1; l->bc_sndlink = bc_sndlink; l->bc_rcvlink = bc_rcvlink; @@ -523,7 +535,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * Returns true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -531,9 +543,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, { struct tipc_link *l; - if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, NULL, peer_caps, bc_sndlink, - NULL, inputq, namedq, link)) + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, + max_win, 0, ownnode, peer, NULL, peer_caps, + bc_sndlink, NULL, inputq, namedq, link)) return false; l = *link; @@ -772,6 +784,8 @@ bool tipc_link_too_silent(struct tipc_link *l) return (l->silent_intv_cnt + 2 > l->abort_limit); } +static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq); /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -804,6 +818,11 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) probe |= l->silent_intv_cnt; if (probe || mstate->monitoring) l->silent_intv_cnt++; + if (l->snd_nxt == l->checkpoint) { + tipc_link_update_cwin(l, 0, 0); + probe = true; + } + l->checkpoint = l->snd_nxt; break; case LINK_RESET: setup = l->rst_cnt++ <= 4; @@ -959,7 +978,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, int pkt_cnt = skb_queue_len(list); int imp = msg_importance(hdr); unsigned int mss = tipc_link_mss(l); - unsigned int maxwin = l->window; + unsigned int cwin = l->window; unsigned int mtu = l->mtu; bool new_bundle; int rc = 0; @@ -988,7 +1007,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, /* Prepare each packet for sending, and add to relevant queue: */ while ((skb = __skb_dequeue(list))) { - if (likely(skb_queue_len(transmq) < maxwin)) { + if (likely(skb_queue_len(transmq) < cwin)) { hdr = buf_msg(skb); msg_set_seqno(hdr, seqno); msg_set_ack(hdr, ack); @@ -1035,17 +1054,61 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return rc; } +static void tipc_link_update_cwin(struct tipc_link *l, int released, + bool retransmitted) +{ + int bklog_len = skb_queue_len(&l->backlogq); + struct sk_buff_head *txq = &l->transmq; + int txq_len = skb_queue_len(txq); + u16 cwin = l->window; + + /* Enter fast recovery */ + if (unlikely(retransmitted)) { + l->ssthresh = max_t(u16, l->window / 2, 300); + l->window = l->ssthresh; + return; + } + /* Enter slow start */ + if (unlikely(!released)) { + l->ssthresh = max_t(u16, l->window / 2, 300); + l->window = l->min_win; + return; + } + /* Don't increase window if no pressure on the transmit queue */ + if (txq_len + bklog_len < cwin) + return; + + /* Don't increase window if there are holes the transmit queue */ + if (txq_len && l->snd_nxt - buf_seqno(skb_peek(txq)) != txq_len) + return; + + l->cong_acks += released; + + /* Slow start */ + if (cwin <= l->ssthresh) { + l->window = min_t(u16, cwin + released, l->max_win); + return; + } + /* Congestion avoidance */ + if (l->cong_acks < cwin) + return; + l->window = min_t(u16, ++cwin, l->max_win); + l->cong_acks = 0; +} + static void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + struct sk_buff_head *txq = &l->transmq; struct sk_buff *skb, *_skb; - struct tipc_msg *hdr; - u16 seqno = l->snd_nxt; u16 ack = l->rcv_nxt - 1; - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u16 seqno = l->snd_nxt; + struct tipc_msg *hdr; + u16 cwin = l->window; u32 imp; - while (skb_queue_len(&l->transmq) < l->window) { + while (skb_queue_len(txq) < cwin) { skb = skb_peek(&l->backlogq); if (!skb) break; @@ -1141,6 +1204,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; + int retransmitted = 0; struct tipc_msg *hdr; int rc = 0; @@ -1160,7 +1224,6 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, continue; if (more(msg_seqno(hdr), to)) break; - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; @@ -1173,11 +1236,12 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted++; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } + tipc_link_update_cwin(l, 0, retransmitted); return 0; } @@ -1338,9 +1402,9 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, return rc; } -static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) +static int tipc_link_release_pkts(struct tipc_link *l, u16 acked) { - bool released = false; + int released = 0; struct sk_buff *skb, *tmp; skb_queue_walk_safe(&l->transmq, skb, tmp) { @@ -1348,7 +1412,7 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) break; __skb_unlink(skb, &l->transmq); kfree_skb(skb); - released = true; + released++; } return released; } @@ -1359,14 +1423,14 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) * * returns the actual allocated memory size */ -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data) +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) { struct sk_buff *skb = skb_peek(&l->deferdq); struct tipc_gap_ack_blks *ga = data; u16 len, expect, seqno = 0; u8 n = 0; - if (!skb) + if (!skb || !gap) goto exit; expect = buf_seqno(skb); @@ -1417,8 +1481,10 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; bool passed = false; + u16 released = 0; u16 seqno, n = 0; int rc = 0; @@ -1430,6 +1496,7 @@ next_gap_ack: /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); + released++; } else if (less_eq(seqno, acked + gap)) { /* First, check if repeated retrans failures occurs? */ if (!passed && link_retransmit_failure(l, l, &rc)) @@ -1449,7 +1516,7 @@ next_gap_ack: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; @@ -1463,7 +1530,10 @@ next_gap_ack: goto next_gap_ack; } } - + if (released || retransmitted) + tipc_link_update_cwin(l, released, retransmitted); + if (released) + tipc_link_advance_backlog(l, xmitq); return 0; } @@ -1487,7 +1557,6 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) l->snd_nxt = l->rcv_nxt; return TIPC_LINK_SND_STATE; } - /* Unicast ACK */ l->rcv_unacked = 0; l->stats.sent_acks++; @@ -1521,7 +1590,8 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { u32 def_cnt = ++l->stats.deferred_recv; - u32 defq_len = skb_queue_len(&l->deferdq); + struct sk_buff_head *dfq = &l->deferdq; + u32 defq_len = skb_queue_len(dfq); int match1, match2; if (link_is_bc_rcvlink(l)) { @@ -1532,8 +1602,12 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, return 0; } - if (defq_len >= 3 && !((defq_len - 3) % 16)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); + if (defq_len >= 3 && !((defq_len - 3) % 16)) { + u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; + + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, + rcvgap, 0, 0, xmitq); + } return 0; } @@ -1548,6 +1622,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *defq = &l->deferdq; struct tipc_msg *hdr = buf_msg(skb); u16 seqno, rcv_nxt, win_lim; + int released = 0; int rc = 0; /* Verify and update link state */ @@ -1566,21 +1641,17 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, if (unlikely(!link_is_up(l))) { if (l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; - goto drop; + kfree_skb(skb); + break; } /* Drop if outside receive window */ if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) { l->stats.duplicates++; - goto drop; - } - - /* Forward queues and wake up waiting users */ - if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { - tipc_link_advance_backlog(l, xmitq); - if (unlikely(!skb_queue_empty(&l->wakeupq))) - link_prepare_wakeup(l); + kfree_skb(skb); + break; } + released += tipc_link_release_pkts(l, msg_ack(hdr)); /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { @@ -1603,9 +1674,13 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, break; } while ((skb = __tipc_skb_dequeue(defq, l->rcv_nxt))); - return rc; -drop: - kfree_skb(skb); + /* Forward queues and wake up waiting users */ + if (released) { + tipc_link_update_cwin(l, released, 0); + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } return rc; } @@ -1631,7 +1706,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) return; - if (!skb_queue_empty(dfq)) + if ((probe || probe_reply) && !skb_queue_empty(dfq)) rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, @@ -1664,7 +1739,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) - glen = tipc_build_gap_ack_blks(l, data); + glen = tipc_build_gap_ack_blks(l, data, rcvgap); tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + glen + dlen); skb_trim(skb, INT_H_SIZE + glen + dlen); @@ -2074,19 +2149,18 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ - if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) + if ((reply || msg_is_keepalive(hdr)) && + more(peers_snd_nxt, rcv_nxt) && + !tipc_link_is_synching(l) && + skb_queue_empty(&l->deferdq)) rcvgap = peers_snd_nxt - l->rcv_nxt; if (rcvgap || reply) tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); - - /* If NACK, retransmit will now start at right position */ if (gap) l->stats.recv_nacks++; - - tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } @@ -2305,15 +2379,18 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; } -void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) { int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); - l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); - l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); + l->min_win = min_win; + l->ssthresh = max_win; + l->max_win = max_win; + l->window = min_win; + l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } @@ -2366,10 +2443,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (max_win < TIPC_DEF_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) return -EINVAL; } @@ -2605,7 +2682,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) goto prop_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index c09e9d49d0a3..d3c1c3fc1659 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,7 +73,7 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, + u32 min_win, u32 max_win, u32 session, u32 ownnode, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link); bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); -int tipc_link_window(struct tipc_link *l); +int tipc_link_min_win(struct tipc_link *l); +int tipc_link_max_win(struct tipc_link *l); void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); @@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); -void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); diff --git a/net/tipc/net.c b/net/tipc/net.c index 2de3cec9929d..85400e4242de 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -302,3 +302,59 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) return err; } + +static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) +{ + struct tipc_net *tn = tipc_net(net); + struct nlattr *attrs; + void *hdr; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + 0, TIPC_NL_ADDR_LEGACY_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); + if (!attrs) + goto msg_full; + + if (tn->legacy_addr_format) + if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_nl_msg msg; + struct sk_buff *rep; + int err; + + rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rep) + return -ENOMEM; + + msg.skb = rep; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + err = __tipc_nl_addr_legacy_get(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + + return genlmsg_reply(msg.skb, info); +} diff --git a/net/tipc/net.h b/net/tipc/net.h index b7f2e364eb99..6740d97c706e 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -47,5 +47,6 @@ void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index e53231bd23b4..7c35094c20b8 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -83,6 +83,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_ADDR] = { .type = NLA_U32 }, [TIPC_NLA_NET_NODEID] = { .type = NLA_U64 }, [TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 }, + [TIPC_NLA_NET_ADDR_LEGACY] = { .type = NLA_FLAG } }; const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { @@ -273,6 +274,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_node_flush_key, }, #endif + { + .cmd = TIPC_NL_ADDR_LEGACY_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_net_addr_legacy_get, + }, }; struct genl_family tipc_genl_family __ro_after_init = { diff --git a/net/tipc/node.c b/net/tipc/node.c index ab04e00cb95b..99b28b69fc17 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1139,7 +1139,8 @@ void tipc_node_check_dest(struct net *net, u32 addr, snd_l = tipc_bc_sndlink(net); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, U16_MAX, - tipc_link_window(snd_l), + tipc_link_min_win(snd_l), + tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, @@ -1233,7 +1234,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, session, + b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -2360,8 +2361,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; - err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], - props); + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; @@ -2380,10 +2380,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - tipc_link_set_queue_limits(link, win); + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, + tipc_link_min_win(link), + max_win); } } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ed113735c019..d6620ad53546 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -828,7 +828,8 @@ struct tipc_media udp_media_info = { .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_DEF_LINK_WIN, .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index cd91ad812291..1ba5a92832bb 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -178,7 +178,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ -static void tls_device_sk_destruct(struct sock *sk) +void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); @@ -196,6 +196,7 @@ static void tls_device_sk_destruct(struct sock *sk) if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); } +EXPORT_SYMBOL_GPL(tls_device_sk_destruct); void tls_device_free_resources_tx(struct sock *sk) { @@ -903,7 +904,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk, spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_device_sk_destruct; + smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 774babbee045..6756a3ccc392 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -676,6 +676,16 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } +static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) +{ + struct sock *sk = sock->sk; + struct unix_sock *u; + + if (sk) { + u = unix_sk(sock->sk); + seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds)); + } +} static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, @@ -701,6 +711,7 @@ static const struct proto_ops unix_stream_ops = { .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { @@ -726,6 +737,7 @@ static const struct proto_ops unix_dgram_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { @@ -751,6 +763,7 @@ static const struct proto_ops unix_seqpacket_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static struct proto unix_proto = { @@ -788,6 +801,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); + memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -1572,6 +1586,28 @@ static bool unix_skb_scm_eq(struct sk_buff *skb, unix_secdata_eq(scm, skb); } +static void scm_stat_add(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds += fp->count; +} + +static void scm_stat_del(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds -= fp->count; +} + /* * Send AF_UNIX data. */ @@ -1757,7 +1793,10 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); @@ -1859,7 +1898,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto pipe_err_free; maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2058,8 +2100,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, - &last); + skb = __skb_try_recv_datagram(sk, flags, scm_stat_del, + &skip, &err, &last); if (skb) break; @@ -2353,8 +2395,12 @@ unlock: sk_peek_offset_bwd(sk, chunk); - if (UNIXCB(skb).fp) + if (UNIXCB(skb).fp) { + spin_lock(&sk->sk_receive_queue.lock); + scm_stat_del(sk, skb); + spin_unlock(&sk->sk_receive_queue.lock); unix_detach_fds(&scm, skb); + } if (unix_skb_len(skb)) break; diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 8abcb815af2d..56356d2980c8 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -26,6 +26,18 @@ config VSOCKETS_DIAG Enable this module so userspace applications can query open sockets. +config VSOCKETS_LOOPBACK + tristate "Virtual Sockets loopback transport" + depends on VSOCKETS + default y + select VIRTIO_VSOCKETS_COMMON + help + This module implements a loopback transport for Virtual Sockets, + using vmw_vsock_virtio_transport_common. + + To compile this driver as a module, choose M here: the module + will be called vsock_loopback. If unsure, say N. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index 7c6f9a0b67b0..6a943ec95c4a 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o +obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 74db4cd637a7..9c5b2a91baad 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -136,6 +136,8 @@ static const struct vsock_transport *transport_h2g; static const struct vsock_transport *transport_g2h; /* Transport used for DGRAM communication */ static const struct vsock_transport *transport_dgram; +/* Transport used for local communication */ +static const struct vsock_transport *transport_local; static DEFINE_MUTEX(vsock_register_mutex); /**** UTILS ****/ @@ -386,6 +388,21 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected) } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); +static bool vsock_use_local_transport(unsigned int remote_cid) +{ + if (!transport_local) + return false; + + if (remote_cid == VMADDR_CID_LOCAL) + return true; + + if (transport_g2h) { + return remote_cid == transport_g2h->get_local_cid(); + } else { + return remote_cid == VMADDR_CID_HOST; + } +} + static void vsock_deassign_transport(struct vsock_sock *vsk) { if (!vsk->transport) @@ -402,9 +419,9 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) * (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: + * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if + * g2h is not loaded, will use local transport; * - remote CID <= VMADDR_CID_HOST will use guest->host transport; - * - remote CID == local_cid (guest->host transport) will use guest->host - * transport for loopback (host->guest transports don't support loopback); * - remote CID > VMADDR_CID_HOST will use host->guest transport; */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) @@ -419,9 +436,9 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) new_transport = transport_dgram; break; case SOCK_STREAM: - if (remote_cid <= VMADDR_CID_HOST || - (transport_g2h && - remote_cid == transport_g2h->get_local_cid())) + if (vsock_use_local_transport(remote_cid)) + new_transport = transport_local; + else if (remote_cid <= VMADDR_CID_HOST) new_transport = transport_g2h; else new_transport = transport_h2g; @@ -464,6 +481,9 @@ bool vsock_find_cid(unsigned int cid) if (transport_h2g && cid == VMADDR_CID_HOST) return true; + if (transport_local && cid == VMADDR_CID_LOCAL) + return true; + return false; } EXPORT_SYMBOL_GPL(vsock_find_cid); @@ -2137,7 +2157,7 @@ EXPORT_SYMBOL_GPL(vsock_core_get_transport); int vsock_core_register(const struct vsock_transport *t, int features) { - const struct vsock_transport *t_h2g, *t_g2h, *t_dgram; + const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; int err = mutex_lock_interruptible(&vsock_register_mutex); if (err) @@ -2146,6 +2166,7 @@ int vsock_core_register(const struct vsock_transport *t, int features) t_h2g = transport_h2g; t_g2h = transport_g2h; t_dgram = transport_dgram; + t_local = transport_local; if (features & VSOCK_TRANSPORT_F_H2G) { if (t_h2g) { @@ -2171,9 +2192,18 @@ int vsock_core_register(const struct vsock_transport *t, int features) t_dgram = t; } + if (features & VSOCK_TRANSPORT_F_LOCAL) { + if (t_local) { + err = -EBUSY; + goto err_busy; + } + t_local = t; + } + transport_h2g = t_h2g; transport_g2h = t_g2h; transport_dgram = t_dgram; + transport_local = t_local; err_busy: mutex_unlock(&vsock_register_mutex); @@ -2194,6 +2224,9 @@ void vsock_core_unregister(const struct vsock_transport *t) if (transport_dgram == t) transport_dgram = NULL; + if (transport_local == t) + transport_local = NULL; + mutex_unlock(&vsock_register_mutex); } EXPORT_SYMBOL_GPL(vsock_core_unregister); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 1458c5c8b64d..dfbaf6bd8b1c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -44,10 +44,6 @@ struct virtio_vsock { spinlock_t send_pkt_list_lock; struct list_head send_pkt_list; - struct work_struct loopback_work; - spinlock_t loopback_list_lock; /* protects loopback_list */ - struct list_head loopback_list; - atomic_t queued_replies; /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] @@ -86,20 +82,6 @@ out_rcu: return ret; } -static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, - struct virtio_vsock_pkt *pkt) -{ - int len = pkt->len; - - spin_lock_bh(&vsock->loopback_list_lock); - list_add_tail(&pkt->list, &vsock->loopback_list); - spin_unlock_bh(&vsock->loopback_list_lock); - - queue_work(virtio_vsock_workqueue, &vsock->loopback_work); - - return len; -} - static void virtio_transport_send_pkt_work(struct work_struct *work) { @@ -194,7 +176,8 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) } if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { - len = virtio_transport_send_pkt_loopback(vsock, pkt); + virtio_transport_free_pkt(pkt); + len = -ENODEV; goto out_rcu; } @@ -502,33 +485,6 @@ static struct virtio_transport virtio_transport = { .send_pkt = virtio_transport_send_pkt, }; -static void virtio_transport_loopback_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, loopback_work); - LIST_HEAD(pkts); - - spin_lock_bh(&vsock->loopback_list_lock); - list_splice_init(&vsock->loopback_list, &pkts); - spin_unlock_bh(&vsock->loopback_list_lock); - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_recv_pkt(&virtio_transport, pkt); - } -out: - mutex_unlock(&vsock->rx_lock); -} - static void virtio_transport_rx_work(struct work_struct *work) { struct virtio_vsock *vsock = @@ -633,13 +589,10 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_init(&vsock->event_lock); spin_lock_init(&vsock->send_pkt_list_lock); INIT_LIST_HEAD(&vsock->send_pkt_list); - spin_lock_init(&vsock->loopback_list_lock); - INIT_LIST_HEAD(&vsock->loopback_list); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); - INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work); mutex_lock(&vsock->tx_lock); vsock->tx_run = true; @@ -720,22 +673,12 @@ static void virtio_vsock_remove(struct virtio_device *vdev) } spin_unlock_bh(&vsock->send_pkt_list_lock); - spin_lock_bh(&vsock->loopback_list_lock); - while (!list_empty(&vsock->loopback_list)) { - pkt = list_first_entry(&vsock->loopback_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - spin_unlock_bh(&vsock->loopback_list_lock); - /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); /* Other works can be queued before 'config->del_vqs()', so we flush * all works before to free the vsock object to avoid use after free. */ - flush_work(&vsock->loopback_work); flush_work(&vsock->rx_work); flush_work(&vsock->tx_work); flush_work(&vsock->event_work); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 6abec3fc81d1..d9f0c9c5425a 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -11,9 +11,6 @@ #include <linux/sched/signal.h> #include <linux/ctype.h> #include <linux/list.h> -#include <linux/virtio.h> -#include <linux/virtio_ids.h> -#include <linux/virtio_config.h> #include <linux/virtio_vsock.h> #include <uapi/linux/vsockmon.h> diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 644d32e43d23..4b8b1150a738 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -648,7 +648,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg) static bool vmci_transport_stream_allow(u32 cid, u32 port) { static const u32 non_socket_contexts[] = { - VMADDR_CID_RESERVED, + VMADDR_CID_LOCAL, }; int i; diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c new file mode 100644 index 000000000000..a45f7ffca8c5 --- /dev/null +++ b/net/vmw_vsock/vsock_loopback.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* loopback transport for vsock using virtio_transport_common APIs + * + * Copyright (C) 2013-2019 Red Hat, Inc. + * Authors: Asias He <asias@redhat.com> + * Stefan Hajnoczi <stefanha@redhat.com> + * Stefano Garzarella <sgarzare@redhat.com> + * + */ +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/virtio_vsock.h> + +struct vsock_loopback { + struct workqueue_struct *workqueue; + + spinlock_t pkt_list_lock; /* protects pkt_list */ + struct list_head pkt_list; + struct work_struct pkt_work; +}; + +static struct vsock_loopback the_vsock_loopback; + +static u32 vsock_loopback_get_local_cid(void) +{ + return VMADDR_CID_LOCAL; +} + +static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + int len = pkt->len; + + spin_lock_bh(&vsock->pkt_list_lock); + list_add_tail(&pkt->list, &vsock->pkt_list); + spin_unlock_bh(&vsock->pkt_list_lock); + + queue_work(vsock->workqueue, &vsock->pkt_work); + + return len; +} + +static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + struct virtio_vsock_pkt *pkt, *n; + LIST_HEAD(freeme); + + spin_lock_bh(&vsock->pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + return 0; +} + +static struct virtio_transport loopback_transport = { + .transport = { + .module = THIS_MODULE, + + .get_local_cid = vsock_loopback_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + .shutdown = virtio_transport_shutdown, + .cancel_pkt = vsock_loopback_cancel_pkt, + + .dgram_bind = virtio_transport_dgram_bind, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + .notify_buffer_size = virtio_transport_notify_buffer_size, + }, + + .send_pkt = vsock_loopback_send_pkt, +}; + +static void vsock_loopback_work(struct work_struct *work) +{ + struct vsock_loopback *vsock = + container_of(work, struct vsock_loopback, pkt_work); + LIST_HEAD(pkts); + + spin_lock_bh(&vsock->pkt_list_lock); + list_splice_init(&vsock->pkt_list, &pkts); + spin_unlock_bh(&vsock->pkt_list_lock); + + while (!list_empty(&pkts)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_recv_pkt(&loopback_transport, pkt); + } +} + +static int __init vsock_loopback_init(void) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + int ret; + + vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); + if (!vsock->workqueue) + return -ENOMEM; + + spin_lock_init(&vsock->pkt_list_lock); + INIT_LIST_HEAD(&vsock->pkt_list); + INIT_WORK(&vsock->pkt_work, vsock_loopback_work); + + ret = vsock_core_register(&loopback_transport.transport, + VSOCK_TRANSPORT_F_LOCAL); + if (ret) + goto out_wq; + + return 0; + +out_wq: + destroy_workqueue(vsock->workqueue); + return ret; +} + +static void __exit vsock_loopback_exit(void) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + struct virtio_vsock_pkt *pkt; + + vsock_core_unregister(&loopback_transport.transport); + + flush_work(&vsock->pkt_work); + + spin_lock_bh(&vsock->pkt_list_lock); + while (!list_empty(&vsock->pkt_list)) { + pkt = list_first_entry(&vsock->pkt_list, + struct virtio_vsock_pkt, list); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->pkt_list_lock); + + destroy_workqueue(vsock->workqueue); +} + +module_init(vsock_loopback_init); +module_exit(vsock_loopback_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Stefano Garzarella <sgarzare@redhat.com>"); +MODULE_DESCRIPTION("loopback transport for vsock"); +MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index da5262b2298b..fa3526592c51 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12900,8 +12900,7 @@ static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd, return -EINVAL; } - return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy, - extack); + return nla_validate_nested(attr, vcmd->maxattr, vcmd->policy, extack); } static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 328f661b83b2..02ada7ab8c6e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,8 @@ #define TX_BATCH_SIZE 16 +static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); + bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && @@ -39,21 +41,21 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { - return xskq_has_addrs(umem->fq, cnt); + return xskq_cons_has_entries(umem->fq, cnt); } EXPORT_SYMBOL(xsk_umem_has_addrs); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { - return xskq_peek_addr(umem->fq, addr, umem); + return xskq_cons_peek_addr(umem->fq, addr, umem); } EXPORT_SYMBOL(xsk_umem_peek_addr); -void xsk_umem_discard_addr(struct xdp_umem *umem) +void xsk_umem_release_addr(struct xdp_umem *umem) { - xskq_discard_addr(umem->fq); + xskq_cons_release(umem->fq); } -EXPORT_SYMBOL(xsk_umem_discard_addr); +EXPORT_SYMBOL(xsk_umem_release_addr); void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { @@ -124,7 +126,7 @@ static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, void *to_buf = xdp_umem_get_data(umem, addr); addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) { + if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; u64 page_start = addr & ~(PAGE_SIZE - 1); u64 first_len = PAGE_SIZE - (addr - page_start); @@ -146,7 +148,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) u32 metalen; int err; - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; @@ -165,9 +167,9 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) offset += metalen; addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (!err) { - xskq_discard_addr(xs->umem->fq); + xskq_cons_release(xs->umem->fq); xdp_return_buff(xdp); return 0; } @@ -178,7 +180,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); + int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); if (err) xs->rx_dropped++; @@ -214,7 +216,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { - xskq_produce_flush_desc(xs->rx); + xskq_prod_submit(xs->rx); xs->sk.sk_data_ready(&xs->sk); } @@ -234,7 +236,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) goto out_unlock; } - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { err = -ENOSPC; goto out_drop; @@ -245,12 +247,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) memcpy(buffer, xdp->data_meta, len + metalen); addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) goto out_drop; - xskq_discard_addr(xs->umem->fq); - xskq_produce_flush_desc(xs->rx); + xskq_cons_release(xs->umem->fq); + xskq_prod_submit(xs->rx); spin_unlock_bh(&xs->rx_lock); @@ -264,11 +266,9 @@ out_unlock: return err; } -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; err = xsk_rcv(xs, xdp); @@ -281,10 +281,9 @@ int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, return 0; } -void __xsk_map_flush(struct bpf_map *map) +void __xsk_map_flush(void) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { @@ -295,7 +294,7 @@ void __xsk_map_flush(struct bpf_map *map) void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { - xskq_produce_flush_addr_n(umem->cq, nb_entries); + xskq_prod_submit_n(umem->cq, nb_entries); } EXPORT_SYMBOL(xsk_umem_complete_tx); @@ -317,13 +316,18 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; - if (xskq_produce_addr_lazy(umem->cq, desc->addr)) + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ + if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } @@ -358,7 +362,7 @@ static void xsk_destruct_skb(struct sk_buff *skb) unsigned long flags; spin_lock_irqsave(&xs->tx_completion_lock, flags); - WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + xskq_prod_submit_addr(xs->umem->cq, addr); spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); @@ -378,7 +382,7 @@ static int xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - while (xskq_peek_desc(xs->tx, &desc, xs->umem)) { + while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) { char *buffer; u64 addr; u32 len; @@ -399,7 +403,12 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ + if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { kfree_skb(skb); goto out; } @@ -411,7 +420,7 @@ static int xsk_generic_xmit(struct sock *sk) skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { /* SKB completed but not sent */ @@ -477,9 +486,9 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, __xsk_sendmsg(sk); } - if (xs->rx && !xskq_empty_desc(xs->rx)) + if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; - if (xs->tx && !xskq_full_desc(xs->tx)) + if (xs->tx && !xskq_cons_is_full(xs->tx)) mask |= EPOLLOUT | EPOLLWRNORM; return mask; @@ -1183,7 +1192,7 @@ static struct pernet_operations xsk_net_ops = { static int __init xsk_init(void) { - int err; + int err, cpu; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) @@ -1201,6 +1210,8 @@ static int __init xsk_init(void) if (err) goto out_pernet; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); return 0; out_pernet: diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index b66504592d9b..c90e9c1e3c63 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -18,14 +18,14 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) q->chunk_mask = chunk_mask; } -static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); -} + struct xdp_umem_ring *umem_ring; + struct xdp_rxtx_ring *rxtx_ring; -static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) -{ - return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); + if (umem_queue) + return struct_size(umem_ring, desc, q->nentries); + return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) @@ -43,8 +43,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = umem_queue ? xskq_umem_get_ring_size(q) : - xskq_rxtx_get_ring_size(q); + size = xskq_get_ring_size(q, umem_queue); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index eddae4688862..bec2af11853a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,9 +10,6 @@ #include <linux/if_xdp.h> #include <net/xdp_sock.h> -#define RX_BATCH_SIZE 16 -#define LAZY_UPDATE_THRESHOLD 128 - struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; @@ -36,10 +33,8 @@ struct xsk_queue { u64 size; u32 ring_mask; u32 nentries; - u32 prod_head; - u32 prod_tail; - u32 cons_head; - u32 cons_tail; + u32 cached_prod; + u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; }; @@ -86,56 +81,31 @@ struct xsk_queue { * now and again after circling through the ring. */ -/* Common functions operating for both RXTX and umem queues */ - -static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) -{ - return q ? q->invalid_descs : 0; -} - -static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) -{ - u32 entries = q->prod_tail - q->cons_tail; - - if (entries == 0) { - /* Refresh the local pointer */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; - } - - return (entries > dcnt) ? dcnt : entries; -} - -static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) -{ - u32 free_entries = q->nentries - (producer - q->cons_tail); - - if (free_entries >= dcnt) - return free_entries; - - /* Refresh the local tail pointer */ - q->cons_tail = READ_ONCE(q->ring->consumer); - return q->nentries - (producer - q->cons_tail); -} - -static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) -{ - u32 entries = q->prod_tail - q->cons_tail; - - if (entries >= cnt) - return true; - - /* Refresh the local pointer. */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; - - return entries >= cnt; -} +/* The operations on the rings are the following: + * + * producer consumer + * + * RESERVE entries PEEK in the ring for entries + * WRITE data into the ring READ data from the ring + * SUBMIT entries RELEASE entries + * + * The producer reserves one or more entries in the ring. It can then + * fill in these entries and finally submit them so that they can be + * seen and read by the consumer. + * + * The consumer peeks into the ring to see if the producer has written + * any new entries. If so, the producer can then read these entries + * and when it is done reading them release them back to the producer + * so that the producer can use these slots to fill in new entries. + * + * The function names below reflect these operations. + */ -/* UMEM queue */ +/* Functions that read and validate content from consumer rings. */ -static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, - u64 length) +static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, + u64 addr, + u64 length) { bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; bool next_pg_contig = @@ -145,9 +115,16 @@ static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, return cross_pg && !next_pg_contig; } -static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) +static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, + u64 addr, + u64 length, + struct xdp_umem *umem) { - if (addr >= q->size) { + u64 base_addr = xsk_umem_extract_addr(addr); + + addr = xsk_umem_add_offset_to_addr(addr); + if (base_addr >= q->size || addr >= q->size || + xskq_cons_crosses_non_contig_pg(umem, addr, length)) { q->invalid_descs++; return false; } @@ -155,15 +132,9 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) return true; } -static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, - u64 length, - struct xdp_umem *umem) +static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) { - u64 base_addr = xsk_umem_extract_addr(addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->size || addr >= q->size || - xskq_crosses_non_contig_pg(umem, addr, length)) { + if (addr >= q->size) { q->invalid_descs++; return false; } @@ -171,204 +142,240 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, return true; } -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { - while (q->cons_tail != q->cons_head) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx] & q->chunk_mask; if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_is_valid_addr_unaligned(q, *addr, + if (xskq_cons_is_valid_unaligned(q, *addr, umem->chunk_size_nohr, umem)) - return addr; + return true; goto out; } - if (xskq_is_valid_addr(q, *addr)) - return addr; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; out: - q->cons_tail++; + q->cached_cons++; } - return NULL; + return false; } -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) + return false; + + if (d->len > umem->chunk_size_nohr || d->options) { + q->invalid_descs++; + return false; + } - /* Order consumer and data */ - smp_rmb(); + return true; } - return xskq_validate_addr(q, addr, umem); -} + if (!xskq_cons_is_valid_addr(q, d->addr)) + return false; -static inline void xskq_discard_addr(struct xsk_queue *q) -{ - q->cons_tail++; + if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || + d->options) { + q->invalid_descs++; + return false; + } + + return true; } -static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) +static inline bool xskq_cons_read_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - if (xskq_nb_free(q, q->prod_tail, 1) == 0) - return -ENOSPC; + while (q->cached_cons != q->cached_prod) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = q->cached_cons & q->ring_mask; - /* A, matches D */ - ring->desc[q->prod_tail++ & q->ring_mask] = addr; + *desc = ring->desc[idx]; + if (xskq_cons_is_valid_desc(q, desc, umem)) + return true; - /* Order producer and data */ - smp_wmb(); /* B, matches C */ + q->cached_cons++; + } - WRITE_ONCE(q->ring->producer, q->prod_tail); - return 0; + return false; } -static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +/* Functions for consumers */ + +static inline void __xskq_cons_release(struct xsk_queue *q) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + smp_mb(); /* D, matches A */ + WRITE_ONCE(q->ring->consumer, q->cached_cons); +} - if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) - return -ENOSPC; +static inline void __xskq_cons_peek(struct xsk_queue *q) +{ + /* Refresh the local pointer */ + q->cached_prod = READ_ONCE(q->ring->producer); + smp_rmb(); /* C, matches B */ +} - /* A, matches D */ - ring->desc[q->prod_head++ & q->ring_mask] = addr; - return 0; +static inline void xskq_cons_get_entries(struct xsk_queue *q) +{ + __xskq_cons_release(q); + __xskq_cons_peek(q); } -static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, - u32 nb_entries) +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) { - /* Order producer and data */ - smp_wmb(); /* B, matches C */ + u32 entries = q->cached_prod - q->cached_cons; - q->prod_tail += nb_entries; - WRITE_ONCE(q->ring->producer, q->prod_tail); + if (entries >= cnt) + return true; + + __xskq_cons_peek(q); + entries = q->cached_prod - q->cached_cons; + + return entries >= cnt; } -static inline int xskq_reserve_addr(struct xsk_queue *q) +static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { - if (xskq_nb_free(q, q->prod_head, 1) == 0) - return -ENOSPC; + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr(q, addr, umem); +} - /* A, matches D */ - q->prod_head++; - return 0; +static inline bool xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_desc(q, desc, umem); } -/* Rx/Tx queue */ +static inline void xskq_cons_release(struct xsk_queue *q) +{ + /* To improve performance, only update local state here. + * Reflect this to global state when we get new entries + * from the ring in xskq_cons_get_entries(). + */ + q->cached_cons++; +} -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_is_full(struct xsk_queue *q) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem)) - return false; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; +} - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } +/* Functions for producers */ - return true; - } +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (!xskq_is_valid_addr(q, d->addr)) + if (free_entries) return false; - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { - q->invalid_descs++; - return false; - } + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); - return true; + return !free_entries; } -static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline int xskq_prod_reserve(struct xsk_queue *q) { - while (q->cons_tail != q->cons_head) { - struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; - - *desc = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_desc(q, desc, umem)) - return desc; - - q->cons_tail++; - } + if (xskq_prod_is_full(q)) + return -ENOSPC; - return NULL; + /* A, matches D */ + q->cached_prod++; + return 0; } -static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); - - /* Order consumer and data */ - smp_rmb(); /* C, matches B */ - } + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - return xskq_validate_desc(q, desc, umem); -} + if (xskq_prod_is_full(q)) + return -ENOSPC; -static inline void xskq_discard_desc(struct xsk_queue *q) -{ - q->cons_tail++; + /* A, matches D */ + ring->desc[q->cached_prod++ & q->ring_mask] = addr; + return 0; } -static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u64 addr, u32 len) +static inline int xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx; + u32 idx; - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ - idx = (q->prod_head++) & q->ring_mask; + idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; return 0; } -static inline void xskq_produce_flush_desc(struct xsk_queue *q) +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail = q->prod_head; - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, idx); +} + +static inline void xskq_prod_submit(struct xsk_queue *q) +{ + __xskq_prod_submit(q, q->cached_prod); +} + +static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = q->ring->producer; + + ring->desc[idx++ & q->ring_mask] = addr; + + __xskq_prod_submit(q, idx); } -static inline bool xskq_full_desc(struct xsk_queue *q) +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { - return xskq_nb_avail(q, q->nentries) == q->nentries; + __xskq_prod_submit(q, q->ring->producer + nb_entries); } -static inline bool xskq_empty_desc(struct xsk_queue *q) +static inline bool xskq_prod_is_empty(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); +} + +/* For both producers and consumers */ + +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? q->invalid_descs : 0; } void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); |