diff options
58 files changed, 535 insertions, 436 deletions
diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index 15073627c53a..21cc27e75f50 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -328,9 +328,6 @@ properties: snps,tx-sched-dwrr: type: boolean description: Deficit Weighted Round Robin - snps,tx-sched-sp: - type: boolean - description: Strict priority allOf: - if: required: @@ -339,7 +336,6 @@ properties: properties: snps,tx-sched-wfq: false snps,tx-sched-dwrr: false - snps,tx-sched-sp: false - if: required: - snps,tx-sched-wfq @@ -347,7 +343,6 @@ properties: properties: snps,tx-sched-wrr: false snps,tx-sched-dwrr: false - snps,tx-sched-sp: false - if: required: - snps,tx-sched-dwrr @@ -355,15 +350,6 @@ properties: properties: snps,tx-sched-wrr: false snps,tx-sched-wfq: false - snps,tx-sched-sp: false - - if: - required: - - snps,tx-sched-sp - then: - properties: - snps,tx-sched-wrr: false - snps,tx-sched-wfq: false - snps,tx-sched-dwrr: false patternProperties: "^queue[0-9]$": description: Each subnode represents a queue. diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index f20dfe70fa0e..be0743dac3ff 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -348,16 +348,10 @@ static int dst_fetch_ha(const struct dst_entry *dst, static bool has_gateway(const struct dst_entry *dst, sa_family_t family) { - const struct rtable *rt; - const struct rt6_info *rt6; + if (family == AF_INET) + return dst_rtable(dst)->rt_uses_gateway; - if (family == AF_INET) { - rt = container_of(dst, struct rtable, dst); - return rt->rt_uses_gateway; - } - - rt6 = dst_rt6_info(dst); - return rt6->rt6i_flags & RTF_GATEWAY; + return dst_rt6_info(dst)->rt6i_flags & RTF_GATEWAY; } static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index 284f97876054..088ab8076db4 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -122,7 +122,7 @@ void prueth_xmit_free(struct prueth_tx_chn *tx_chn, } int emac_tx_complete_packets(struct prueth_emac *emac, int chn, - int budget) + int budget, bool *tdown) { struct net_device *ndev = emac->ndev; struct cppi5_host_desc_t *desc_tx; @@ -145,6 +145,7 @@ int emac_tx_complete_packets(struct prueth_emac *emac, int chn, if (cppi5_desc_is_tdcm(desc_dma)) { if (atomic_dec_and_test(&emac->tdown_cnt)) complete(&emac->tdown_complete); + *tdown = true; break; } @@ -190,19 +191,37 @@ int emac_tx_complete_packets(struct prueth_emac *emac, int chn, return num_tx; } +static enum hrtimer_restart emac_tx_timer_callback(struct hrtimer *timer) +{ + struct prueth_tx_chn *tx_chns = + container_of(timer, struct prueth_tx_chn, tx_hrtimer); + + enable_irq(tx_chns->irq); + return HRTIMER_NORESTART; +} + static int emac_napi_tx_poll(struct napi_struct *napi_tx, int budget) { struct prueth_tx_chn *tx_chn = prueth_napi_to_tx_chn(napi_tx); struct prueth_emac *emac = tx_chn->emac; + bool tdown = false; int num_tx_packets; - num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget); + num_tx_packets = emac_tx_complete_packets(emac, tx_chn->id, budget, + &tdown); if (num_tx_packets >= budget) return budget; - if (napi_complete_done(napi_tx, num_tx_packets)) - enable_irq(tx_chn->irq); + if (napi_complete_done(napi_tx, num_tx_packets)) { + if (unlikely(tx_chn->tx_pace_timeout_ns && !tdown)) { + hrtimer_start(&tx_chn->tx_hrtimer, + ns_to_ktime(tx_chn->tx_pace_timeout_ns), + HRTIMER_MODE_REL_PINNED); + } else { + enable_irq(tx_chn->irq); + } + } return num_tx_packets; } @@ -226,6 +245,9 @@ int prueth_ndev_add_tx_napi(struct prueth_emac *emac) struct prueth_tx_chn *tx_chn = &emac->tx_chns[i]; netif_napi_add_tx(emac->ndev, &tx_chn->napi_tx, emac_napi_tx_poll); + hrtimer_init(&tx_chn->tx_hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + tx_chn->tx_hrtimer.function = &emac_tx_timer_callback; ret = request_irq(tx_chn->irq, prueth_tx_irq, IRQF_TRIGGER_HIGH, tx_chn->name, tx_chn); @@ -871,8 +893,15 @@ int emac_napi_rx_poll(struct napi_struct *napi_rx, int budget) break; } - if (num_rx < budget && napi_complete_done(napi_rx, num_rx)) - enable_irq(emac->rx_chns.irq[rx_flow]); + if (num_rx < budget && napi_complete_done(napi_rx, num_rx)) { + if (unlikely(emac->rx_pace_timeout_ns)) { + hrtimer_start(&emac->rx_hrtimer, + ns_to_ktime(emac->rx_pace_timeout_ns), + HRTIMER_MODE_REL_PINNED); + } else { + enable_irq(emac->rx_chns.irq[rx_flow]); + } + } return num_rx; } diff --git a/drivers/net/ethernet/ti/icssg/icssg_ethtool.c b/drivers/net/ethernet/ti/icssg/icssg_ethtool.c index ca20325d4d3e..c8d0f45cc5b1 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_ethtool.c +++ b/drivers/net/ethernet/ti/icssg/icssg_ethtool.c @@ -201,6 +201,93 @@ static void emac_get_rmon_stats(struct net_device *ndev, rmon_stats->hist_tx[4] = emac_get_stat_by_name(emac, "tx_bucket5_frames"); } +static int emac_get_coalesce(struct net_device *ndev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct prueth_emac *emac = netdev_priv(ndev); + struct prueth_tx_chn *tx_chn; + + tx_chn = &emac->tx_chns[0]; + + coal->rx_coalesce_usecs = emac->rx_pace_timeout_ns / 1000; + coal->tx_coalesce_usecs = tx_chn->tx_pace_timeout_ns / 1000; + + return 0; +} + +static int emac_get_per_queue_coalesce(struct net_device *ndev, u32 queue, + struct ethtool_coalesce *coal) +{ + struct prueth_emac *emac = netdev_priv(ndev); + struct prueth_tx_chn *tx_chn; + + if (queue >= PRUETH_MAX_TX_QUEUES) + return -EINVAL; + + tx_chn = &emac->tx_chns[queue]; + + coal->tx_coalesce_usecs = tx_chn->tx_pace_timeout_ns / 1000; + + return 0; +} + +static int emac_set_coalesce(struct net_device *ndev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct prueth_emac *emac = netdev_priv(ndev); + struct prueth *prueth = emac->prueth; + struct prueth_tx_chn *tx_chn; + + tx_chn = &emac->tx_chns[0]; + + if (coal->rx_coalesce_usecs && + coal->rx_coalesce_usecs < ICSSG_MIN_COALESCE_USECS) { + dev_info(prueth->dev, "defaulting to min value of %dus for rx-usecs\n", + ICSSG_MIN_COALESCE_USECS); + coal->rx_coalesce_usecs = ICSSG_MIN_COALESCE_USECS; + } + + if (coal->tx_coalesce_usecs && + coal->tx_coalesce_usecs < ICSSG_MIN_COALESCE_USECS) { + dev_info(prueth->dev, "defaulting to min value of %dus for tx-usecs\n", + ICSSG_MIN_COALESCE_USECS); + coal->tx_coalesce_usecs = ICSSG_MIN_COALESCE_USECS; + } + + emac->rx_pace_timeout_ns = coal->rx_coalesce_usecs * 1000; + tx_chn->tx_pace_timeout_ns = coal->tx_coalesce_usecs * 1000; + + return 0; +} + +static int emac_set_per_queue_coalesce(struct net_device *ndev, u32 queue, + struct ethtool_coalesce *coal) +{ + struct prueth_emac *emac = netdev_priv(ndev); + struct prueth *prueth = emac->prueth; + struct prueth_tx_chn *tx_chn; + + if (queue >= PRUETH_MAX_TX_QUEUES) + return -EINVAL; + + tx_chn = &emac->tx_chns[queue]; + + if (coal->tx_coalesce_usecs && + coal->tx_coalesce_usecs < ICSSG_MIN_COALESCE_USECS) { + dev_info(prueth->dev, "defaulting to min value of %dus for tx-usecs for tx-%u\n", + ICSSG_MIN_COALESCE_USECS, queue); + coal->tx_coalesce_usecs = ICSSG_MIN_COALESCE_USECS; + } + + tx_chn->tx_pace_timeout_ns = coal->tx_coalesce_usecs * 1000; + + return 0; +} + const struct ethtool_ops icssg_ethtool_ops = { .get_drvinfo = emac_get_drvinfo, .get_msglevel = emac_get_msglevel, @@ -209,6 +296,12 @@ const struct ethtool_ops icssg_ethtool_ops = { .get_ethtool_stats = emac_get_ethtool_stats, .get_strings = emac_get_strings, .get_ts_info = emac_get_ts_info, + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS | + ETHTOOL_COALESCE_TX_USECS, + .get_coalesce = emac_get_coalesce, + .set_coalesce = emac_set_coalesce, + .get_per_queue_coalesce = emac_get_per_queue_coalesce, + .set_per_queue_coalesce = emac_set_per_queue_coalesce, .get_channels = emac_get_channels, .set_channels = emac_set_channels, .get_link_ksettings = emac_get_link_ksettings, diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 186b0365c2e5..7c9e9518f555 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -243,6 +243,16 @@ static void emac_adjust_link(struct net_device *ndev) } } +static enum hrtimer_restart emac_rx_timer_callback(struct hrtimer *timer) +{ + struct prueth_emac *emac = + container_of(timer, struct prueth_emac, rx_hrtimer); + int rx_flow = PRUETH_RX_FLOW_DATA; + + enable_irq(emac->rx_chns.irq[rx_flow]); + return HRTIMER_NORESTART; +} + static int emac_phy_connect(struct prueth_emac *emac) { struct prueth *prueth = emac->prueth; @@ -582,8 +592,10 @@ static int emac_ndo_stop(struct net_device *ndev) netdev_err(ndev, "tx teardown timeout\n"); prueth_reset_tx_chan(emac, emac->tx_ch_num, true); - for (i = 0; i < emac->tx_ch_num; i++) + for (i = 0; i < emac->tx_ch_num; i++) { napi_disable(&emac->tx_chns[i].napi_tx); + hrtimer_cancel(&emac->tx_chns[i].tx_hrtimer); + } max_rx_flows = PRUETH_MAX_RX_FLOWS; k3_udma_glue_tdown_rx_chn(emac->rx_chns.rx_chn, true); @@ -591,6 +603,7 @@ static int emac_ndo_stop(struct net_device *ndev) prueth_reset_rx_chan(&emac->rx_chns, max_rx_flows, true); napi_disable(&emac->napi_rx); + hrtimer_cancel(&emac->rx_hrtimer); cancel_work_sync(&emac->rx_mode_work); @@ -801,6 +814,9 @@ static int prueth_netdev_init(struct prueth *prueth, ndev->features = ndev->hw_features; netif_napi_add(ndev, &emac->napi_rx, emac_napi_rx_poll); + hrtimer_init(&emac->rx_hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + emac->rx_hrtimer.function = &emac_rx_timer_callback; prueth->emac[mac] = emac; return 0; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index 82e38ef5635b..a78c5eb75fb8 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -108,6 +108,8 @@ struct prueth_tx_chn { u32 descs_num; unsigned int irq; char name[32]; + struct hrtimer tx_hrtimer; + unsigned long tx_pace_timeout_ns; }; struct prueth_rx_chn { @@ -127,6 +129,9 @@ struct prueth_rx_chn { #define PRUETH_MAX_TX_TS_REQUESTS 50 /* Max simultaneous TX_TS requests */ +/* Minimum coalesce time in usecs for both Tx and Rx */ +#define ICSSG_MIN_COALESCE_USECS 20 + /* data for each emac port */ struct prueth_emac { bool is_sr1; @@ -183,6 +188,10 @@ struct prueth_emac { struct delayed_work stats_work; u64 stats[ICSSG_NUM_STATS]; + + /* RX IRQ Coalescing Related */ + struct hrtimer rx_hrtimer; + unsigned long rx_pace_timeout_ns; }; /** @@ -320,7 +329,7 @@ void prueth_ndev_del_tx_napi(struct prueth_emac *emac, int num); void prueth_xmit_free(struct prueth_tx_chn *tx_chn, struct cppi5_host_desc_t *desc); int emac_tx_complete_packets(struct prueth_emac *emac, int chn, - int budget); + int budget, bool *tdown); int prueth_ndev_add_tx_napi(struct prueth_emac *emac); int prueth_init_tx_chns(struct prueth_emac *emac); int prueth_init_rx_chns(struct prueth_emac *emac, diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index f6eab66c2660..2b486e7c749c 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -141,9 +141,6 @@ static const struct ethtool_ops loopback_ethtool_ops = { static int loopback_dev_init(struct net_device *dev) { - dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); - if (!dev->lstats) - return -ENOMEM; netdev_lockdep_set_classes(dev); return 0; } @@ -151,7 +148,6 @@ static int loopback_dev_init(struct net_device *dev) static void loopback_dev_free(struct net_device *dev) { dev_net(dev)->loopback_dev = NULL; - free_percpu(dev->lstats); } static const struct net_device_ops loopback_ops = { @@ -191,6 +187,7 @@ static void gen_lo_setup(struct net_device *dev, dev->header_ops = hdr_ops; dev->netdev_ops = dev_ops; dev->needs_free_netdev = true; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; dev->priv_destructor = dev_destructor; netif_set_tso_max_size(dev, GSO_MAX_SIZE); diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 784b9b2d275e..3a252ac5dd28 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -860,7 +860,7 @@ static int vrf_rt6_create(struct net_device *dev) static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 5f17a2a5d0e3..41fe8a043d61 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -970,9 +970,8 @@ static inline struct dst_entry *qeth_dst_check_rcu(struct sk_buff *skb, static inline __be32 qeth_next_hop_v4_rcu(struct sk_buff *skb, struct dst_entry *dst) { - struct rtable *rt = (struct rtable *) dst; - - return (rt) ? rt_nexthop(rt, ip_hdr(skb)->daddr) : ip_hdr(skb)->daddr; + return (dst) ? rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr) : + ip_hdr(skb)->daddr; } static inline struct in6_addr *qeth_next_hop_v6_rcu(struct sk_buff *skb, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f849e7d110ed..41853424b41d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3136,6 +3136,7 @@ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); +void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f76825e5b92a..36b133f04d30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -353,8 +353,6 @@ struct sk_buff; #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS -extern int sysctl_max_skb_frags; - /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. */ @@ -1180,15 +1178,6 @@ static inline bool skb_dst_is_noref(const struct sk_buff *skb) return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } -/** - * skb_rtable - Returns the skb &rtable - * @skb: buffer - */ -static inline struct rtable *skb_rtable(const struct sk_buff *skb) -{ - return (struct rtable *)skb_dst(skb); -} - /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. diff --git a/include/net/dsa.h b/include/net/dsa.h index a6ef7e4c503f..eef702dbea78 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -24,9 +24,6 @@ struct dsa_8021q_context; struct tc_action; -struct phy_device; -struct fixed_phy_status; -struct phylink_link_state; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 @@ -869,14 +866,6 @@ struct dsa_switch_ops { int regnum, u16 val); /* - * Link state adjustment (called from libphy) - */ - void (*adjust_link)(struct dsa_switch *ds, int port, - struct phy_device *phydev); - void (*fixed_link_update)(struct dsa_switch *ds, int port, - struct fixed_phy_status *st); - - /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 003667a1efd6..30e9570beb2a 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -38,6 +38,9 @@ struct net_hotdata { int max_backlog; int dev_tx_weight; int dev_rx_weight; + int sysctl_max_skb_frags; + int sysctl_skb_defer_max; + int sysctl_mem_pcpu_rsv; }; #define inet_ehash_secret net_hotdata.tcp_protocol.secret diff --git a/include/net/ip.h b/include/net/ip.h index 25cb688bdc62..6d735e00d3f3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -423,7 +423,7 @@ int ip_decrease_ttl(struct iphdr *iph) static inline int ip_mtu_locked(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *)dst; + const struct rtable *rt = dst_rtable(dst); return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); } @@ -461,7 +461,7 @@ static inline bool ip_sk_ignore_df(const struct sock *sk) static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { - const struct rtable *rt = container_of(dst, struct rtable, dst); + const struct rtable *rt = dst_rtable(dst); struct net *net = dev_net(dst->dev); unsigned int mtu; diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h new file mode 100644 index 000000000000..a6ab2f4f5e28 --- /dev/null +++ b/include/net/proto_memory.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PROTO_MEMORY_H +#define _PROTO_MEMORY_H + +#include <net/sock.h> +#include <net/hotdata.h> + +/* 1 MB per cpu, in page units */ +#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) + +static inline bool sk_has_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure != NULL; +} + +static inline bool +proto_memory_pressure(const struct proto *prot) +{ + if (!prot->memory_pressure) + return false; + return !!READ_ONCE(*prot->memory_pressure); +} + +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return proto_memory_pressure(sk->sk_prot); +} + +static inline bool sk_under_memory_pressure(const struct sock *sk) +{ + if (!sk->sk_prot->memory_pressure) + return false; + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + mem_cgroup_under_socket_pressure(sk->sk_memcg)) + return true; + + return !!READ_ONCE(*sk->sk_prot->memory_pressure); +} + +static inline long +proto_memory_allocated(const struct proto *prot) +{ + return max(0L, atomic_long_read(prot->memory_allocated)); +} + +static inline long +sk_memory_allocated(const struct sock *sk) +{ + return proto_memory_allocated(sk->sk_prot); +} + +static inline void proto_memory_pcpu_drain(struct proto *proto) +{ + int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); + + if (val) + atomic_long_add(val, proto->memory_allocated); +} + +static inline void +sk_memory_allocated_add(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val >= READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +static inline void +sk_memory_allocated_sub(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val <= -READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +#endif /* _PROTO_MEMORY_H */ diff --git a/include/net/route.h b/include/net/route.h index 630d1ef6868a..93833cfe9c96 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -75,6 +75,17 @@ struct rtable { rt_pmtu:31; }; +#define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst) + +/** + * skb_rtable - Returns the skb &rtable + * @skb: buffer + */ +static inline struct rtable *skb_rtable(const struct sk_buff *skb) +{ + return dst_rtable(skb_dst(skb)); +} + static inline bool rt_is_input_route(const struct rtable *rt) { return rt->rt_is_input != 0; diff --git a/include/net/sock.h b/include/net/sock.h index 48bcc845202f..0450494a1766 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1371,75 +1371,6 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk, #endif } -static inline bool sk_has_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure != NULL; -} - -static inline bool sk_under_global_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure && - !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline bool sk_under_memory_pressure(const struct sock *sk) -{ - if (!sk->sk_prot->memory_pressure) - return false; - - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) - return true; - - return !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline long -proto_memory_allocated(const struct proto *prot) -{ - return max(0L, atomic_long_read(prot->memory_allocated)); -} - -static inline long -sk_memory_allocated(const struct sock *sk) -{ - return proto_memory_allocated(sk->sk_prot); -} - -/* 1 MB per cpu, in page units */ -#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) -extern int sysctl_mem_pcpu_rsv; - -static inline void proto_memory_pcpu_drain(struct proto *proto) -{ - int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); - - if (val) - atomic_long_add(val, proto->memory_allocated); -} - -static inline void -sk_memory_allocated_add(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - -static inline void -sk_memory_allocated_sub(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) @@ -1466,15 +1397,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot) return percpu_counter_sum_positive(prot->sockets_allocated); } -static inline bool -proto_memory_pressure(struct proto *prot) -{ - if (!prot->memory_pressure) - return false; - return !!READ_ONCE(*prot->memory_pressure); -} - - #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { diff --git a/include/net/tcp.h b/include/net/tcp.h index fe98fb01879b..0a51e6a45bce 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -296,14 +296,6 @@ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } -static inline bool tcp_out_of_memory(struct sock *sk) -{ - if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) - return true; - return false; -} - static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); @@ -316,7 +308,7 @@ static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) void sk_forced_mem_schedule(struct sock *sk, int size); -bool tcp_check_oom(struct sock *sk, int shift); +bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; diff --git a/net/atm/clip.c b/net/atm/clip.c index 362e8d25a79e..42b910cb4e8e 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -345,7 +345,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, dev->stats.tx_dropped++; return NETDEV_TX_OK; } - rt = (struct rtable *) dst; + rt = dst_rtable(dst); if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else diff --git a/net/core/dev.c b/net/core/dev.c index c9e59eff8ec8..e02d2363347e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -940,6 +940,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) } EXPORT_SYMBOL(dev_get_by_napi_id); +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} + /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace @@ -951,7 +963,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex) struct net_device *dev; int ret; - down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -960,12 +971,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex) goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); - up_read(&devnet_rename_sem); return ret; } @@ -1217,7 +1227,10 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); + write_sequnlock(&netdev_rename_lock); + if (err < 0) { up_write(&devnet_rename_sem); return err; @@ -1257,7 +1270,9 @@ rollback: if (err >= 0) { err = ret; down_write(&devnet_rename_sem); + write_seqlock(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; @@ -4450,7 +4465,6 @@ EXPORT_SYMBOL(__dev_direct_xmit); *************************************************************************/ static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -11404,8 +11418,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_net_set(dev, net); dev->ifindex = new_ifindex; - if (new_name[0]) /* Rename the netdev to prepared name */ + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); + } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); diff --git a/net/core/dev.h b/net/core/dev.h index 8572d2c8dc4a..b7b518bc2be5 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -36,7 +36,6 @@ int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ -extern unsigned int sysctl_skb_defer_max; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index b17171345d64..0c0bdb058c5b 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -83,7 +83,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) return NULL; *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); + return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); diff --git a/net/core/filter.c b/net/core/filter.c index 6d319c76188b..29165744c505 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2317,8 +2317,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); + struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { diff --git a/net/core/hotdata.c b/net/core/hotdata.c index c8a7a451c18a..d0aaaaa556f2 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <net/hotdata.h> #include <linux/cache.h> #include <linux/jiffies.h> #include <linux/list.h> - +#include <net/hotdata.h> +#include <net/proto_memory.h> struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), @@ -18,5 +18,8 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .max_backlog = 1000, .dev_tx_weight = 64, .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, + .sysctl_skb_defer_max = 64, + .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 543007f159f9..55bcacf67df3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -316,7 +316,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0c8b82750000..5f382e94b4d1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -109,9 +109,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); - /* kcm_write_msgs() relies on casting paged frags to bio_vec to use * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the * netmem is a page. @@ -6988,7 +6985,7 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb->destructor); sd = &per_cpu(softnet_data, cpu); - defer_max = READ_ONCE(sysctl_skb_defer_max); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; @@ -7040,7 +7037,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { - size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; diff --git a/net/core/sock.c b/net/core/sock.c index fe9195186c13..8d6e638b5426 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -127,6 +127,7 @@ #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> @@ -283,7 +284,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE; int sysctl_tstamp_allow_data __read_mostly = 1; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 903ab4a51c17..6da5995ac86a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ #include <net/busy_poll.h> #include <net/pkt_sched.h> #include <net/hotdata.h> +#include <net/proto_memory.h> #include <net/rps.h> #include "dev.h" @@ -415,7 +416,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "mem_pcpu_rsv", - .data = &sysctl_mem_pcpu_rsv, + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -595,7 +596,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -654,7 +655,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "skb_defer_max", - .data = &sysctl_skb_defer_max, + .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 2f347cd37316..12521a7d4048 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1511,8 +1511,7 @@ static int dsa_switch_probe(struct dsa_switch *ds) ds->ops->phylink_mac_config || ds->ops->phylink_mac_finish || ds->ops->phylink_mac_link_down || - ds->ops->phylink_mac_link_up || - ds->ops->adjust_link) + ds->ops->phylink_mac_link_up) return -EINVAL; } diff --git a/net/dsa/port.c b/net/dsa/port.c index c6febc3d96d9..9a249d4ac3a5 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1535,25 +1535,6 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } -static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) -{ - struct device_node *phy_dn; - struct phy_device *phydev; - - phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0); - if (!phy_dn) - return NULL; - - phydev = of_phy_find_device(phy_dn); - if (!phydev) { - of_node_put(phy_dn); - return ERR_PTR(-EPROBE_DEFER); - } - - of_node_put(phy_dn); - return phydev; -} - static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) @@ -1616,17 +1597,10 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config, phy_interface_t interface) { struct dsa_port *dp = dsa_phylink_to_port(config); - struct phy_device *phydev = NULL; struct dsa_switch *ds = dp->ds; - if (dsa_port_is_user(dp)) - phydev = dp->user->phydev; - - if (!ds->ops->phylink_mac_link_down) { - if (ds->ops->adjust_link && phydev) - ds->ops->adjust_link(ds, dp->index, phydev); + if (!ds->ops->phylink_mac_link_down) return; - } ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } @@ -1641,11 +1615,8 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; - if (!ds->ops->phylink_mac_link_up) { - if (ds->ops->adjust_link && phydev) - ds->ops->adjust_link(ds, dp->index, phydev); + if (!ds->ops->phylink_mac_link_up) return; - } ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev, speed, duplex, tx_pause, rx_pause); @@ -1708,78 +1679,6 @@ void dsa_port_phylink_destroy(struct dsa_port *dp) dp->pl = NULL; } -static int dsa_shared_port_setup_phy_of(struct dsa_port *dp, bool enable) -{ - struct dsa_switch *ds = dp->ds; - struct phy_device *phydev; - int port = dp->index; - int err = 0; - - phydev = dsa_port_get_phy_device(dp); - if (!phydev) - return 0; - - if (IS_ERR(phydev)) - return PTR_ERR(phydev); - - if (enable) { - err = genphy_resume(phydev); - if (err < 0) - goto err_put_dev; - - err = genphy_read_status(phydev); - if (err < 0) - goto err_put_dev; - } else { - err = genphy_suspend(phydev); - if (err < 0) - goto err_put_dev; - } - - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port, phydev); - - dev_dbg(ds->dev, "enabled port's phy: %s", phydev_name(phydev)); - -err_put_dev: - put_device(&phydev->mdio.dev); - return err; -} - -static int dsa_shared_port_fixed_link_register_of(struct dsa_port *dp) -{ - struct device_node *dn = dp->dn; - struct dsa_switch *ds = dp->ds; - struct phy_device *phydev; - int port = dp->index; - phy_interface_t mode; - int err; - - err = of_phy_register_fixed_link(dn); - if (err) { - dev_err(ds->dev, - "failed to register the fixed PHY of port %d\n", - port); - return err; - } - - phydev = of_phy_find_device(dn); - - err = of_get_phy_mode(dn, &mode); - if (err) - mode = PHY_INTERFACE_MODE_NA; - phydev->interface = mode; - - genphy_read_status(phydev); - - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port, phydev); - - put_device(&phydev->mdio.dev); - - return 0; -} - static int dsa_shared_port_phylink_register(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; @@ -1983,44 +1882,28 @@ int dsa_shared_port_link_register_of(struct dsa_port *dp) dsa_switches_apply_workarounds)) return -EINVAL; - if (!ds->ops->adjust_link) { - if (missing_link_description) { - dev_warn(ds->dev, - "Skipping phylink registration for %s port %d\n", - dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); - } else { - dsa_shared_port_link_down(dp); + if (missing_link_description) { + dev_warn(ds->dev, + "Skipping phylink registration for %s port %d\n", + dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); + } else { + dsa_shared_port_link_down(dp); - return dsa_shared_port_phylink_register(dp); - } - return 0; + return dsa_shared_port_phylink_register(dp); } - dev_warn(ds->dev, - "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); - - if (of_phy_is_fixed_link(dp->dn)) - return dsa_shared_port_fixed_link_register_of(dp); - else - return dsa_shared_port_setup_phy_of(dp, true); + return 0; } void dsa_shared_port_link_unregister_of(struct dsa_port *dp) { - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->adjust_link && dp->pl) { + if (dp->pl) { rtnl_lock(); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); return; } - - if (of_phy_is_fixed_link(dp->dn)) - of_phy_deregister_fixed_link(dp->dn); - else - dsa_shared_port_setup_phy_of(dp, false); } int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a7cfeda28bb2..486a8d4f53b1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1307,8 +1307,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) int inet_sk_rebuild_header(struct sock *sk) { + struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; struct ip_options_rcu *inet_opt; struct flowi4 *fl4; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ab82ca104496..11c1519b3699 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1003,6 +1003,55 @@ out_of_mem: * User level interface (ioctl) */ +static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, + bool getarp) +{ + struct net_device *dev; + + if (getarp) + dev = dev_get_by_name_rcu(net, r->arp_dev); + else + dev = __dev_get_by_name(net, r->arp_dev); + if (!dev) + return ERR_PTR(-ENODEV); + + /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ + if (!r->arp_ha.sa_family) + r->arp_ha.sa_family = dev->type; + + if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) + return ERR_PTR(-EINVAL); + + return dev; +} + +static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) +{ + struct net_device *dev; + struct rtable *rt; + __be32 ip; + + if (r->arp_dev[0]) + return arp_req_dev_by_name(net, r, false); + + if (r->arp_flags & ATF_PUBL) + return NULL; + + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + + rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); + if (IS_ERR(rt)) + return ERR_CAST(rt); + + dev = rt->dst.dev; + ip_rt_put(rt); + + if (!dev) + return ERR_PTR(-EINVAL); + + return dev; +} + /* * Set (create) an ARP cache entry. */ @@ -1023,11 +1072,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { - __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask && mask != htonl(0xFFFFFFFF)) - return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, r->arp_ha.sa_data); @@ -1035,6 +1081,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -ENODEV; } if (mask) { + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; @@ -1043,30 +1091,20 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return arp_req_set_proxy(net, dev, 1); } -static int arp_req_set(struct net *net, struct arpreq *r, - struct net_device *dev) +static int arp_req_set(struct net *net, struct arpreq *r) { - __be32 ip; struct neighbour *neigh; + struct net_device *dev; + __be32 ip; int err; + dev = arp_req_dev(net, r); + if (IS_ERR(dev)) + return PTR_ERR(dev); + if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); - ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (r->arp_flags & ATF_PERM) - r->arp_flags |= ATF_COM; - if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, 0, 0, - RT_SCOPE_LINK); - - if (IS_ERR(rt)) - return PTR_ERR(rt); - dev = rt->dst.dev; - ip_rt_put(rt); - if (!dev) - return -EINVAL; - } switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: @@ -1088,12 +1126,18 @@ static int arp_req_set(struct net *net, struct arpreq *r, break; } + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; - if (r->arp_flags & ATF_PERM) + + if (r->arp_flags & ATF_PERM) { + r->arp_flags |= ATF_COM; state = NUD_PERMANENT; + } + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | @@ -1117,27 +1161,40 @@ static unsigned int arp_state_to_flags(struct neighbour *neigh) * Get an ARP cache entry. */ -static int arp_req_get(struct arpreq *r, struct net_device *dev) +static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; - int err = -ENXIO; + struct net_device *dev; + + if (!r->arp_dev[0]) + return -ENODEV; + + dev = arp_req_dev_by_name(net, r, true); + if (IS_ERR(dev)) + return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); - if (neigh) { - if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) { - read_lock_bh(&neigh->lock); - memcpy(r->arp_ha.sa_data, neigh->ha, - min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); - r->arp_flags = arp_state_to_flags(neigh); - read_unlock_bh(&neigh->lock); - r->arp_ha.sa_family = dev->type; - strscpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); - err = 0; - } + if (!neigh) + return -ENXIO; + + if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); + return -ENXIO; } - return err; + + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, + min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + + neigh_release(neigh); + + r->arp_ha.sa_family = dev->type; + netdev_copy_name(dev, r->arp_dev); + + return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) @@ -1168,37 +1225,31 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force) static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { - __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask == htonl(0xFFFFFFFF)) - return pneigh_delete(&arp_tbl, net, &ip, dev); + if (mask) { + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (mask) - return -EINVAL; + return pneigh_delete(&arp_tbl, net, &ip, dev); + } return arp_req_set_proxy(net, dev, 0); } -static int arp_req_delete(struct net *net, struct arpreq *r, - struct net_device *dev) +static int arp_req_delete(struct net *net, struct arpreq *r) { + struct net_device *dev; __be32 ip; + dev = arp_req_dev(net, r); + if (IS_ERR(dev)) + return PTR_ERR(dev); + if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, 0, 0, - RT_SCOPE_LINK); - if (IS_ERR(rt)) - return PTR_ERR(rt); - dev = rt->dst.dev; - ip_rt_put(rt); - if (!dev) - return -EINVAL; - } + return arp_invalidate(dev, ip, true); } @@ -1208,9 +1259,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r, int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { - int err; struct arpreq r; - struct net_device *dev = NULL; + __be32 *netmask; + int err; switch (cmd) { case SIOCDARP: @@ -1233,42 +1284,34 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; + + netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) - ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = - htonl(0xFFFFFFFFUL); - rtnl_lock(); - if (r.arp_dev[0]) { - err = -ENODEV; - dev = __dev_get_by_name(net, r.arp_dev); - if (!dev) - goto out; - - /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ - if (!r.arp_ha.sa_family) - r.arp_ha.sa_family = dev->type; - err = -EINVAL; - if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) - goto out; - } else if (cmd == SIOCGARP) { - err = -ENODEV; - goto out; - } + *netmask = htonl(0xFFFFFFFFUL); + else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) + return -EINVAL; switch (cmd) { case SIOCDARP: - err = arp_req_delete(net, &r, dev); + rtnl_lock(); + err = arp_req_delete(net, &r); + rtnl_unlock(); break; case SIOCSARP: - err = arp_req_set(net, &r, dev); + rtnl_lock(); + err = arp_req_set(net, &r); + rtnl_unlock(); break; case SIOCGARP: - err = arp_req_get(&r, dev); + rcu_read_lock(); + err = arp_req_get(net, &r); + rcu_read_unlock(); + + if (!err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; break; } -out: - rtnl_unlock(); - if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) - err = -EFAULT; + return err; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 437e782b9663..207482d30dc7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -483,6 +483,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct icmp_bxm *param) { struct net_device *route_lookup_dev; + struct dst_entry *dst, *dst2; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -508,16 +509,17 @@ static struct rtable *icmp_route_lookup(struct net *net, /* No need to clone since we're just using its address. */ rt2 = rt; - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(fl4), NULL, 0); - if (!IS_ERR(rt)) { + dst = xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + rt = dst_rtable(dst); + if (!IS_ERR(dst)) { if (rt != rt2) return rt; - } else if (PTR_ERR(rt) == -EPERM) { + } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; - } else + } else { return rt; - + } err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; @@ -551,19 +553,19 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, - flowi4_to_flowi(&fl4_dec), NULL, - XFRM_LOOKUP_ICMP); - if (!IS_ERR(rt2)) { + dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL, + XFRM_LOOKUP_ICMP); + rt2 = dst_rtable(dst2); + if (!IS_ERR(dst2)) { dst_release(&rt->dst); memcpy(fl4, &fl4_dec, sizeof(*fl4)); rt = rt2; - } else if (PTR_ERR(rt2) == -EPERM) { + } else if (PTR_ERR(dst2) == -EPERM) { if (rt) dst_release(&rt->dst); return rt2; } else { - err = PTR_ERR(rt2); + err = PTR_ERR(dst2); goto relookup_failed; } return rt; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 5e9c8156656a..d6fbcbd2358a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -616,7 +616,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, - ((struct rtable *)dst)->rt_type); + dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1fe794967211..b455bd05a7d5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,7 +198,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; @@ -475,7 +475,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, goto packet_routed; /* Make sure we can route this packet. */ - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { __be32 daddr; @@ -971,7 +971,7 @@ static int __ip_append_data(struct sock *sk, bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; @@ -1390,7 +1390,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 914bc9c35cc7..6c4664c681ca 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -33,6 +33,7 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f89ff2e5a05b..0fd9a3d7ac4a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -819,7 +819,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf u32 mark = skb->mark; __u8 tos = iph->tos; - rt = (struct rtable *) dst; + rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); @@ -827,7 +827,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct dst_entry *ret = dst; if (rt) { @@ -1044,7 +1044,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); @@ -1115,7 +1115,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); - rt = (struct rtable *)odst; + rt = dst_rtable(odst); if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) @@ -1124,7 +1124,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1181,7 +1181,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -1516,10 +1516,8 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; - ip_dst_metrics_put(dst); - rt_del_uncached_list(rt); + rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) @@ -2820,7 +2818,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); @@ -2865,9 +2863,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; - rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, - flowi4_to_flowi(flp4), - sk, 0); + rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0)); } return rt; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4ec0f4feee00..e1f0efbb29d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -272,6 +272,7 @@ #include <net/inet_common.h> #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> @@ -280,6 +281,7 @@ #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> +#include <net/hotdata.h> #include <net/rps.h> /* Track pending CMSGs. */ @@ -1188,7 +1190,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= READ_ONCE(sysctl_max_skb_frags)) { + if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } @@ -2751,7 +2753,15 @@ static bool tcp_too_many_orphans(int shift) READ_ONCE(sysctl_tcp_max_orphans); } -bool tcp_check_oom(struct sock *sk, int shift) +static bool tcp_out_of_memory(const struct sock *sk) +{ + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) + return true; + return false; +} + +bool tcp_check_oom(const struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53e1150f706f..ad8fa129fcfe 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include <linux/prefetch.h> #include <net/dst.h> #include <net/tcp.h> +#include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ea7ad7d99245..57edf66ff91b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,6 +39,7 @@ #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <linux/compiler.h> #include <linux/gfp.h> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6e2446295089..fe55ff5d379b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1217,7 +1217,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (connected) - rt = (struct rtable *)sk_dst_check(sk, 0); + rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 1dda59e0aeab..fccbbd3e1a4b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -69,7 +69,7 @@ static int xfrm4_get_saddr(struct net *net, int oif, static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rtable *rt = (struct rtable *)xdst->route; + struct rtable *rt = dst_rtable(xdst->route); const struct flowi4 *fl4 = &fl->u.ip4; xdst->u.rt.rt_iif = fl4->flowi4_iif; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0f2506e35359..0627c4c18d1a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -252,9 +252,8 @@ static void aca_free_rcu(struct rcu_head *h) static void aca_put(struct ifacaddr6 *ac) { - if (refcount_dec_and_test(&ac->aca_refcnt)) { - call_rcu(&ac->rcu, aca_free_rcu); - } + if (refcount_dec_and_test(&ac->aca_refcnt)) + call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 970af3983d11..19c8cc5289d5 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -459,7 +459,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &inet->cork.fl.u.ip4; if (connected) - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); rcu_read_lock(); if (!rt) { diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 606349c8df0e..4385fd3b13be 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -81,7 +81,7 @@ static int mpls_xmit(struct sk_buff *skb) ttl = net->mpls.default_ttl; else ttl = ip_hdr(skb)->ttl; - rt = (struct rtable *)dst; + rt = dst_rtable(dst); } else if (dst->ops->family == AF_INET6) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4b13ca362efa..aff17597e6a7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -20,6 +20,7 @@ #include <net/transp_v6.h> #endif #include <net/mptcp.h> +#include <net/hotdata.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include "protocol.h" @@ -1272,7 +1273,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); - if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { + if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 6e8b9d100ad2..3313bceb6cc9 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -318,7 +318,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rtable *) dest_dst->dst_cache; + rt = dst_rtable(dest_dst->dst_cache); else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 100887beed31..c2c005234dcd 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -434,7 +434,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; @@ -446,7 +446,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 2434c624aafd..14d88394bcb7 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -73,7 +73,7 @@ void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + *dest = (__force u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 60239378d43f..6292d6d73b72 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1389,6 +1389,7 @@ err_out4: ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: + lockdep_unregister_key(&sch->root_lock_key); netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 31dfd6c7405b..d3f6006b563c 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -982,6 +982,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, return sch; errout1: + lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e849f368ed91..5a7436a13b74 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -552,7 +552,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, struct flowi *fl) { union sctp_addr *saddr = &t->saddr; - struct rtable *rt = (struct rtable *)t->dst; + struct rtable *rt = dst_rtable(t->dst); if (rt) { saddr->v4.sin_family = AF_INET; @@ -1085,7 +1085,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); - udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, + udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false); return 0; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 08fdf1251f46..5adf0c0a6c1a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -38,6 +38,7 @@ #include <linux/inet.h> #include <linux/slab.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <net/inet_ecn.h> #include <linux/skbuff.h> #include <net/sctp/sctp.h> diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index f892b0903dba..b849a3d133a0 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -174,7 +174,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, local_bh_disable(); ndst = dst_cache_get(cache); if (dst->proto == htons(ETH_P_IP)) { - struct rtable *rt = (struct rtable *)ndst; + struct rtable *rt = dst_rtable(ndst); if (!rt) { struct flowi4 fl = { diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index 72c6001964a6..e9a6c702b8c9 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -28,6 +28,8 @@ TEST_PROGS += nft_zones_many.sh TEST_PROGS += rpath.sh TEST_PROGS += xt_string.sh +TEST_PROGS_EXTENDED = nft_concat_range_perf.sh + TEST_GEN_PROGS = conntrack_dump_flush TEST_GEN_FILES = audit_logread diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 60b86c7f3ea1..5b5b764f6cd0 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -85,3 +85,4 @@ CONFIG_VETH=m CONFIG_VLAN_8021Q=m CONFIG_XFRM_USER=m CONFIG_XFRM_STATISTICS=y +CONFIG_NET_PKTGEN=m diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 2b6661519055..6d66240e149c 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -19,7 +19,7 @@ source lib.sh # - timeout: check that packets match entries until they expire # - performance: estimate matching rate, compare with rbtree and hash baselines TESTS="reported_issues correctness concurrency timeout" -[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance" +[ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}" # Set types, defined by TYPE_ variables below TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto @@ -31,7 +31,7 @@ BUGS="flush_remove_add reload" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" - ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh + ../../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh pktgen/pktgen_bench_xmit_mode_netif_receive.sh" # Definition of set types: @@ -951,6 +951,10 @@ cleanup() { killall iperf 2>/dev/null killall netperf 2>/dev/null killall netserver 2>/dev/null +} + +cleanup_exit() { + cleanup rm -f "$tmp" } @@ -1371,6 +1375,9 @@ test_timeout() { setup veth send_"${proto}" set || return ${ksft_skip} timeout=3 + + [ "$KSFT_MACHINE_SLOW" = "yes" ] && timeout=8 + range_size=1 for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) @@ -1386,7 +1393,7 @@ test_timeout() { range_size=$((range_size + 1)) start=$((end + range_size)) done - sleep 3 + sleep $timeout for i in $(seq "$start" $((start + count))); do end=$((start + range_size)) srcstart=$((start + src_delta)) @@ -1480,10 +1487,13 @@ test_performance() { } test_bug_flush_remove_add() { + rounds=100 + [ "$KSFT_MACHINE_SLOW" = "yes" ] && rounds=10 + set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }' elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' - for i in $(seq 1 100); do + for i in $(seq 1 $rounds); do nft add table t "$set_cmd" || return ${ksft_skip} nft add element t s "$elem1" 2>/dev/null || return 1 nft flush set t s 2>/dev/null || return 1 @@ -1552,7 +1562,7 @@ test_reported_issues() { # Run everything in a separate network namespace [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; } tmp="$(mktemp)" -trap cleanup EXIT +trap cleanup_exit EXIT # Entry point for test runs passed=0 @@ -1584,10 +1594,16 @@ for name in ${TESTS}; do continue fi - printf " %-60s " "${display}" + [ "$KSFT_MACHINE_SLOW" = "yes" ] && count=1 + + printf " %-32s " "${display}" + tthen=$(date +%s) eval test_"${name}" ret=$? + tnow=$(date +%s) + printf "%5ds%-30s" $((tnow-tthen)) + if [ $ret -eq 0 ]; then printf "[ OK ]\n" info_flush diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh new file mode 100755 index 000000000000..5d276995a5c5 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# + +source lib.sh + +[ "$KSFT_MACHINE_SLOW" = yes ] && exit ${ksft_skip} + +NFT_CONCAT_RANGE_TESTS="performance" exec ./nft_concat_range.sh |