From a7c4a547ea1de2e9d6cb82263962aaff3ba67ff5 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 25 Nov 2019 09:06:16 +0100 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index c7b340ddd0e7..fd8c0728ddc7 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2019.5" +#define BATADV_SOURCE_VERSION "2020.0" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From f4191c6d80b92caf21d779429c51dd52e2a16d70 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 14 Nov 2019 22:12:37 +0100 Subject: batman-adv: Strip dots from variable macro kerneldoc The commit 43756e347f21 ("scripts/kernel-doc: Add support for named variable macro arguments") changed the handling of variable macro parameters. The three dots of the argument must no longer be added to the kernel doc. The support for the old format is scheduled to be removed in the future. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/log.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 741cfa3719ff..41fd900121c5 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -74,7 +74,7 @@ __printf(2, 3); * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ do { \ @@ -98,7 +98,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_dbg() - Store debug output without ratelimiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information - * @arg...: format string and variable arguments + * @arg: format string and variable arguments */ #define batadv_dbg(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 0, ## arg) @@ -107,7 +107,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_dbg_ratelimited() - Store debug output with ratelimiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information - * @arg...: format string and variable arguments + * @arg: format string and variable arguments */ #define batadv_dbg_ratelimited(type, bat_priv, arg...) \ _batadv_dbg(type, bat_priv, 1, ## arg) @@ -116,7 +116,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_info() - Store message in debug buffer and print it to kmsg buffer * @net_dev: the soft interface net device * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define batadv_info(net_dev, fmt, arg...) \ do { \ @@ -130,7 +130,7 @@ static inline void _batadv_dbg(int type __always_unused, * batadv_err() - Store error in debug buffer and print it to kmsg buffer * @net_dev: the soft interface net device * @fmt: format string - * @arg...: variable arguments + * @arg: variable arguments */ #define batadv_err(net_dev, fmt, arg...) \ do { \ -- cgit v1.2.3 From 69fed4ce2e9a1686894bc23e1523a7ca1a8c74f6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 16 Nov 2019 08:28:58 +0100 Subject: batman-adv: Fix typo metAdata Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 47718a82eaf2..bdf9827b5f63 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -457,7 +457,7 @@ struct batadv_orig_node { /** * @tt_lock: prevents from updating the table while reading it. Table * update is made up by two operations (data structure update and - * metdata -CRC/TTVN-recalculation) and they have to be executed + * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. */ @@ -1011,7 +1011,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading * the local table. The local TT commit is made up by two operations - * (data structure update and metdata -CRC/TTVN- recalculation) and + * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. */ -- cgit v1.2.3 From 2b1aa5a4c654cc665eb5711f49a2855ea6ebe0db Mon Sep 17 00:00:00 2001 From: RenĂ© Treffer Date: Tue, 26 Nov 2019 10:27:38 +0800 Subject: batman-adv: ELP - use wifi tx bitrate as fallback throughput MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some wifi drivers (e.g. ath10k) provide per-station rx/tx values but no estimated throughput. Setting a better estimate than the default 1 MBit makes these devices work well with B.A.T.M.A.N. V. Signed-off-by: RenĂ© Treffer Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 2614a9caee00..6536e8cc53a0 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -107,10 +107,17 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) } if (ret) goto default_throughput; - if (!(sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT))) - goto default_throughput; - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) + return sinfo.expected_throughput / 100; + + /* try to estimate the expected throughput based on reported tx + * rates + */ + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) + return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + + goto default_throughput; } /* if not a wifi interface, check if this device provides data via -- cgit v1.2.3 From 61a292860de6ab66c9b6aa1b244ccd4590ddc8ec Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 28 Nov 2019 11:26:06 +0100 Subject: batman-adv: Annotate bitwise integer pointer casts The sparse commit 6002ded74587 ("add a flag to warn on casts to/from bitwise pointers") introduced a check for non-direct casts from/to restricted datatypes (when -Wbitwise-pointer is enabled). This triggered various warnings in batman-adv when some (already big endian) buffer content was casted to/from the corresponding big endian integer data types. But these were correct and can therefore be marked with __force to signalize sparse an intended cast from/to a bitwise type. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/distributed-arp-table.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 663a53b6d36e..0eff33580f95 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -844,7 +844,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; - crc = ntohs(*((__be16 *)(&an_addr[4]))); + crc = ntohs(*((__force __be16 *)(&an_addr[4]))); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b0af3a11d406..5004e38fe792 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -246,7 +246,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { - return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN); + return *(__force __be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN); } /** @@ -270,7 +270,9 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { - return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4); + u8 *dst = batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4; + + return *(__force __be32 *)dst; } /** @@ -287,7 +289,7 @@ static u32 batadv_hash_dat(const void *data, u32 size) const unsigned char *key; u32 i; - key = (const unsigned char *)&dat->ip; + key = (__force const unsigned char *)&dat->ip; for (i = 0; i < sizeof(dat->ip); i++) { hash += key[i]; hash += (hash << 10); -- cgit v1.2.3 From b50b0580d27bc45a0637aefc8bac4d31aa85771a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:48:57 +0100 Subject: net: add queue argument to __skb_wait_for_more_packets and __skb_{,try_}recv_datagram This will be used by ESP over TCP to handle the queue of IKE messages. Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/linux/skbuff.h | 11 ++++++++--- net/core/datagram.c | 27 +++++++++++++++++---------- net/ipv4/udp.c | 3 ++- net/unix/af_unix.c | 7 ++++--- 4 files changed, 31 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e9133bcf0544..49a10f9cc538 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3459,7 +3459,8 @@ static inline void skb_frag_list_init(struct sk_buff *skb) for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) -int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, +int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, + int *err, long *timeo_p, const struct sk_buff *skb); struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, @@ -3468,12 +3469,16 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff *skb), int *off, int *err, struct sk_buff **last); -struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags, +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err, struct sk_buff **last); -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, +struct sk_buff *__skb_recv_datagram(struct sock *sk, + struct sk_buff_head *sk_queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err); diff --git a/net/core/datagram.c b/net/core/datagram.c index da3c24ed129c..a78e7f864c1e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -84,7 +84,8 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i /* * Wait for the last received packet to be different from skb */ -int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, +int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, + int *err, long *timeo_p, const struct sk_buff *skb) { int error; @@ -97,7 +98,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, if (error) goto out_err; - if (READ_ONCE(sk->sk_receive_queue.prev) != skb) + if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ @@ -209,6 +210,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket + * @queue: socket queue from which to receive * @flags: MSG\_ flags * @destructor: invoked under the receive lock on successful dequeue * @off: an offset in bytes to peek skb from. Returns an offset @@ -241,13 +243,14 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err, struct sk_buff **last) { - struct sk_buff_head *queue = &sk->sk_receive_queue; struct sk_buff *skb; unsigned long cpu_flags; /* @@ -278,7 +281,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (READ_ONCE(sk->sk_receive_queue.prev) != *last); + } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN; @@ -288,7 +291,9 @@ no_packet: } EXPORT_SYMBOL(__skb_try_recv_datagram); -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, +struct sk_buff *__skb_recv_datagram(struct sock *sk, + struct sk_buff_head *sk_queue, + unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), int *off, int *err) @@ -299,15 +304,16 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, flags, destructor, off, err, - &last); + skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor, + off, err, &last); if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && - !__skb_wait_for_more_packets(sk, err, &timeo, last)); + !__skb_wait_for_more_packets(sk, sk_queue, err, + &timeo, last)); return NULL; } @@ -318,7 +324,8 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, { int off = 0; - return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + return __skb_recv_datagram(sk, &sk->sk_receive_queue, + flags | (noblock ? MSG_DONTWAIT : 0), NULL, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4da5758cc718..e5738d1217a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1708,7 +1708,8 @@ busy_check: /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && - !__skb_wait_for_more_packets(sk, &error, &timeo, + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7cfdce10de36..a7f707fc4cac 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2058,8 +2058,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, - &last); + skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, + NULL, &skip, &err, &last); if (skb) break; @@ -2068,7 +2068,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (err != -EAGAIN) break; } while (timeo && - !__skb_wait_for_more_packets(sk, &err, &timeo, last)); + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); -- cgit v1.2.3 From 7b3801927e52f8621de311277f7fc727635019e7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:48:58 +0100 Subject: xfrm: introduce xfrm_trans_queue_net This will be used by TCP encapsulation to write packets to the encap socket without holding the user socket's lock. Without this reinjection, we're already holding the lock of the user socket, and then try to lock the encap socket as well when we enqueue the encrypted packet. While at it, add a BUILD_BUG_ON like we usually do for skb->cb, since it's missing for struct xfrm_trans_cb. Co-developed-by: Herbert Xu Signed-off-by: Herbert Xu Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 +++ net/xfrm/xfrm_input.c | 21 +++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dda3c025452e..56ff86621bb4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1547,6 +1547,9 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); +int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 2c86a2fc3915..aa35f23c4912 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -36,6 +36,7 @@ struct xfrm_trans_cb { #endif } header; int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); + struct net *net; }; #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) @@ -766,12 +767,13 @@ static void xfrm_trans_reinject(unsigned long data) skb_queue_splice_init(&trans->queue, &queue); while ((skb = __skb_dequeue(&queue))) - XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb); + XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, + NULL, skb); } -int xfrm_trans_queue(struct sk_buff *skb, - int (*finish)(struct net *, struct sock *, - struct sk_buff *)) +int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) { struct xfrm_trans_tasklet *trans; @@ -780,11 +782,22 @@ int xfrm_trans_queue(struct sk_buff *skb, if (skb_queue_len(&trans->queue) >= netdev_max_backlog) return -ENOBUFS; + BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); + XFRM_TRANS_SKB_CB(skb)->finish = finish; + XFRM_TRANS_SKB_CB(skb)->net = net; __skb_queue_tail(&trans->queue, skb); tasklet_schedule(&trans->tasklet); return 0; } +EXPORT_SYMBOL(xfrm_trans_queue_net); + +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)) +{ + return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish); +} EXPORT_SYMBOL(xfrm_trans_queue); void __init xfrm_input_init(void) -- cgit v1.2.3 From cac3c71604cf4eada8df00b1e66892636399cda5 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:48:59 +0100 Subject: xfrm: add route lookup to xfrm4_rcv_encap At this point, with TCP encapsulation, the dst may be gone, but xfrm_input needs one. Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_protocol.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8a4285712808..ea595c8549c7 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -72,6 +72,14 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, if (!head) goto out; + if (!skb_dst(skb)) { + const struct iphdr *iph = ip_hdr(skb); + + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + goto drop; + } + for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; @@ -79,6 +87,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: kfree_skb(skb); return 0; } -- cgit v1.2.3 From 25f6802b4c18817c82cd581d38ce155ad6412176 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:49:00 +0100 Subject: esp4: prepare esp_input_done2 for non-UDP encapsulation For espintcp encapsulation, we will need to get the source port from the TCP header instead of UDP. Introduce a variable to hold the port. Co-developed-by: Herbert Xu Signed-off-by: Herbert Xu Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5c967764041f..c5d826642229 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -601,6 +601,18 @@ int esp_input_done2(struct sk_buff *skb, int err) if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); + __be16 source; + + switch (x->encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } /* * 1) if the NAT-T peer's IP or port changed then @@ -609,11 +621,11 @@ int esp_input_done2(struct sk_buff *skb, int err) * SRC ports. */ if (iph->saddr != x->props.saddr.a4 || - uh->source != encap->encap_sport) { + source != encap->encap_sport) { xfrm_address_t ipaddr; ipaddr.a4 = iph->saddr; - km_new_mapping(x, &ipaddr, uh->source); + km_new_mapping(x, &ipaddr, source); /* XXX: perhaps add an extra * policy check here, to see -- cgit v1.2.3 From eecd227a9a3479038ba2a2f579b3ce9edb364b80 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:49:01 +0100 Subject: esp4: split esp_output_udp_encap and introduce esp_output_encap Co-developed-by: Herbert Xu Signed-off-by: Herbert Xu Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 57 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index c5d826642229..033c61d27148 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -225,45 +225,62 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) { - int encap_type; struct udphdr *uh; __be32 *udpdata32; - __be16 sport, dport; - struct xfrm_encap_tmpl *encap = x->encap; - struct ip_esp_hdr *esph = esp->esph; unsigned int len; - spin_lock_bh(&x->lock); - sport = encap->encap_sport; - dport = encap->encap_dport; - encap_type = encap->encap_type; - spin_unlock_bh(&x->lock); - len = skb->len + esp->tailen - skb_transport_offset(skb); if (len + sizeof(struct iphdr) >= IP_MAX_MTU) - return -EMSGSIZE; + return ERR_PTR(-EMSGSIZE); - uh = (struct udphdr *)esph; + uh = (struct udphdr *)esp->esph; uh->source = sport; uh->dest = dport; uh->len = htons(len); uh->check = 0; + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - esph = (struct ip_esp_hdr *)(uh + 1); - break; case UDP_ENCAP_ESPINUDP_NON_IKE: - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - esph = (struct ip_esp_hdr *)(udpdata32 + 2); + esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport); break; } - *skb_mac_header(skb) = IPPROTO_UDP; + if (IS_ERR(esph)) + return PTR_ERR(esph); + esp->esph = esph; return 0; @@ -281,7 +298,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { - int err = esp_output_udp_encap(x, skb, esp); + int err = esp_output_encap(x, skb, esp); if (err < 0) return err; -- cgit v1.2.3 From e27cca96cd68fa2c6814c90f9a1cfd36bb68c593 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Nov 2019 14:49:02 +0100 Subject: xfrm: add espintcp (RFC 8229) TCP encapsulation of IKE and IPsec messages (RFC 8229) is implemented as a TCP ULP, overriding in particular the sendmsg and recvmsg operations. A Stream Parser is used to extract messages out of the TCP stream using the first 2 bytes as length marker. Received IKE messages are put on "ike_queue", waiting to be dequeued by the custom recvmsg implementation. Received ESP messages are sent to XFRM, like with UDP encapsulation. Some of this code is taken from the original submission by Herbert Xu. Currently, only IPv4 is supported, like for UDP encapsulation. Co-developed-by: Herbert Xu Signed-off-by: Herbert Xu Signed-off-by: Sabrina Dubroca Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/espintcp.h | 39 ++++ include/net/xfrm.h | 1 + include/uapi/linux/udp.h | 1 + net/ipv4/Kconfig | 11 + net/ipv4/esp4.c | 191 +++++++++++++++++- net/xfrm/Makefile | 1 + net/xfrm/espintcp.c | 509 +++++++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_policy.c | 7 + net/xfrm/xfrm_state.c | 3 + 9 files changed, 760 insertions(+), 3 deletions(-) create mode 100644 include/net/espintcp.h create mode 100644 net/xfrm/espintcp.c (limited to 'net') diff --git a/include/net/espintcp.h b/include/net/espintcp.h new file mode 100644 index 000000000000..dd7026a00066 --- /dev/null +++ b/include/net/espintcp.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_ESPINTCP_H +#define _NET_ESPINTCP_H + +#include +#include + +void __init espintcp_init(void); + +int espintcp_push_skb(struct sock *sk, struct sk_buff *skb); +int espintcp_queue_out(struct sock *sk, struct sk_buff *skb); +bool tcp_is_ulp_esp(struct sock *sk); + +struct espintcp_msg { + struct sk_buff *skb; + struct sk_msg skmsg; + int offset; + int len; +}; + +struct espintcp_ctx { + struct strparser strp; + struct sk_buff_head ike_queue; + struct sk_buff_head out_queue; + struct espintcp_msg partial; + void (*saved_data_ready)(struct sock *sk); + void (*saved_write_space)(struct sock *sk); + struct work_struct work; + bool tx_running; +}; + +static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* RCU is only needed for diag */ + return (__force void *)icsk->icsk_ulp_data; +} +#endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 56ff86621bb4..8f71c111e65a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -193,6 +193,7 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; + struct sock __rcu *encap_sk; /* Data for care-of address */ xfrm_address_t *coaddr; diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index 30baccb6c9c4..4828794efcf8 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -42,5 +42,6 @@ struct udphdr { #define UDP_ENCAP_GTP0 4 /* GSM TS 09.60 */ #define UDP_ENCAP_GTP1U 5 /* 3GPP TS 29.060 */ #define UDP_ENCAP_RXRPC 6 +#define TCP_ENCAP_ESPINTCP 7 /* Yikes, this is really xfrm encap types. */ #endif /* _UAPI_LINUX_UDP_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index fc816b187170..f96bd489b362 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -378,6 +378,17 @@ config INET_ESP_OFFLOAD If unsure, say N. +config INET_ESPINTCP + bool "IP: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET_ESP + select STREAM_PARSER + select NET_SOCK_MSG + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv4 sockets. + + If unsure, say N. + config INET_IPCOMP tristate "IP: IPComp transformation" select INET_XFRM_TUNNEL diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 033c61d27148..103c7d599a3c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include @@ -117,6 +119,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4, + dport, x->props.saddr.a4, sport, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. + */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -147,7 +275,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -236,7 +368,7 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, unsigned int len; len = skb->len + esp->tailen - skb_transport_offset(skb); - if (len + sizeof(struct iphdr) >= IP_MAX_MTU) + if (len + sizeof(struct iphdr) > IP_MAX_MTU) return ERR_PTR(-EMSGSIZE); uh = (struct udphdr *)esp->esph; @@ -256,6 +388,41 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, return (struct ip_esp_hdr *)(uh + 1); } +#ifdef CONFIG_INET_ESPINTCP +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { @@ -276,6 +443,9 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport); break; + case TCP_ENCAP_ESPINTCP: + esph = esp_output_tcp_encap(x, skb, esp); + break; } if (IS_ERR(esph)) @@ -296,7 +466,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct sk_buff *trailer; int tailen = esp->tailen; - /* this is non-NULL only with UDP Encapsulation */ + /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { int err = esp_output_encap(x, skb, esp); @@ -491,6 +661,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -617,10 +790,14 @@ int esp_input_done2(struct sk_buff *skb, int err) if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; + struct tcphdr *th = (void *)(skb_network_header(skb) + ihl); struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); __be16 source; switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; @@ -1017,6 +1194,14 @@ static int esp_init_state(struct xfrm_state *x) case UDP_ENCAP_ESPINUDP_NON_IKE: x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); break; +#ifdef CONFIG_INET_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif } } diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index fbc4552d17b8..212a4fcb4a88 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o +obj-$(CONFIG_INET_ESPINTCP) += espintcp.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c new file mode 100644 index 000000000000..f15d6a564b0e --- /dev/null +++ b/net/xfrm/espintcp.c @@ -0,0 +1,509 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, + struct sock *sk) +{ + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf || + !sk_rmem_schedule(sk, skb, skb->truesize)) { + kfree_skb(skb); + return; + } + + skb_set_owner_r(skb, sk); + + memset(skb->cb, 0, sizeof(skb->cb)); + skb_queue_tail(&ctx->ike_queue, skb); + ctx->saved_data_ready(sk); +} + +static void handle_esp(struct sk_buff *skb, struct sock *sk) +{ + skb_reset_transport_header(skb); + memset(skb->cb, 0, sizeof(skb->cb)); + + rcu_read_lock(); + skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + local_bh_disable(); + xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); + local_bh_enable(); + rcu_read_unlock(); +} + +static void espintcp_rcv(struct strparser *strp, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = container_of(strp, struct espintcp_ctx, + strp); + struct strp_msg *rxm = strp_msg(skb); + u32 nonesp_marker; + int err; + + err = skb_copy_bits(skb, rxm->offset + 2, &nonesp_marker, + sizeof(nonesp_marker)); + if (err < 0) { + kfree_skb(skb); + return; + } + + /* remove header, leave non-ESP marker/SPI */ + if (!__pskb_pull(skb, rxm->offset + 2)) { + kfree_skb(skb); + return; + } + + if (pskb_trim(skb, rxm->full_len - 2) != 0) { + kfree_skb(skb); + return; + } + + if (nonesp_marker == 0) + handle_nonesp(ctx, skb, strp->sk); + else + handle_esp(skb, strp->sk); +} + +static int espintcp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + __be16 blen; + u16 len; + int err; + + if (skb->len < rxm->offset + 2) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen)); + if (err < 0) + return err; + + len = be16_to_cpu(blen); + if (len < 6) + return -EINVAL; + + return len; +} + +static int espintcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct sk_buff *skb; + int err = 0; + int copied; + int off = 0; + + flags |= nonblock ? MSG_DONTWAIT : 0; + + skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, NULL, &off, &err); + if (!skb) + return err; + + copied = len; + if (copied > skb->len) + copied = skb->len; + else if (copied < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + + if (flags & MSG_TRUNC) + copied = skb->len; + kfree_skb(skb); + return copied; +} + +int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + if (skb_queue_len(&ctx->out_queue) >= netdev_max_backlog) + return -ENOBUFS; + + __skb_queue_tail(&ctx->out_queue, skb); + + return 0; +} +EXPORT_SYMBOL_GPL(espintcp_queue_out); + +/* espintcp length field is 2B and length includes the length field's size */ +#define MAX_ESPINTCP_MSG (((1 << 16) - 1) - 2) + +static int espintcp_sendskb_locked(struct sock *sk, struct espintcp_msg *emsg, + int flags) +{ + do { + int ret; + + ret = skb_send_sock_locked(sk, emsg->skb, + emsg->offset, emsg->len); + if (ret < 0) + return ret; + + emsg->len -= ret; + emsg->offset += ret; + } while (emsg->len > 0); + + kfree_skb(emsg->skb); + memset(emsg, 0, sizeof(*emsg)); + + return 0; +} + +static int espintcp_sendskmsg_locked(struct sock *sk, + struct espintcp_msg *emsg, int flags) +{ + struct sk_msg *skmsg = &emsg->skmsg; + struct scatterlist *sg; + int done = 0; + int ret; + + flags |= MSG_SENDPAGE_NOTLAST; + sg = &skmsg->sg.data[skmsg->sg.start]; + do { + size_t size = sg->length - emsg->offset; + int offset = sg->offset + emsg->offset; + struct page *p; + + emsg->offset = 0; + + if (sg_is_last(sg)) + flags &= ~MSG_SENDPAGE_NOTLAST; + + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret < 0) { + emsg->offset = offset - sg->offset; + skmsg->sg.start += done; + return ret; + } + + if (ret != size) { + offset += ret; + size -= ret; + goto retry; + } + + done++; + put_page(p); + sk_mem_uncharge(sk, sg->length); + sg = sg_next(sg); + } while (sg); + + memset(emsg, 0, sizeof(*emsg)); + + return 0; +} + +static int espintcp_push_msgs(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + int err; + + if (!emsg->len) + return 0; + + if (ctx->tx_running) + return -EAGAIN; + ctx->tx_running = 1; + + if (emsg->skb) + err = espintcp_sendskb_locked(sk, emsg, 0); + else + err = espintcp_sendskmsg_locked(sk, emsg, 0); + if (err == -EAGAIN) { + ctx->tx_running = 0; + return 0; + } + if (!err) + memset(emsg, 0, sizeof(*emsg)); + + ctx->tx_running = 0; + + return err; +} + +int espintcp_push_skb(struct sock *sk, struct sk_buff *skb) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + unsigned int len; + int offset; + + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + return -ECONNRESET; + } + + offset = skb_transport_offset(skb); + len = skb->len - offset; + + espintcp_push_msgs(sk); + + if (emsg->len) { + kfree_skb(skb); + return -ENOBUFS; + } + + skb_set_owner_w(skb, sk); + + emsg->offset = offset; + emsg->len = len; + emsg->skb = skb; + + espintcp_push_msgs(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(espintcp_push_skb); + +static int espintcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + struct iov_iter pfx_iter; + struct kvec pfx_iov = {}; + size_t msglen = size + 2; + char buf[2] = {0}; + int err, end; + + if (msg->msg_flags) + return -EOPNOTSUPP; + + if (size > MAX_ESPINTCP_MSG) + return -EMSGSIZE; + + if (msg->msg_controllen) + return -EOPNOTSUPP; + + lock_sock(sk); + + err = espintcp_push_msgs(sk); + if (err < 0) { + err = -ENOBUFS; + goto unlock; + } + + sk_msg_init(&emsg->skmsg); + while (1) { + /* only -ENOMEM is possible since we don't coalesce */ + err = sk_msg_alloc(sk, &emsg->skmsg, msglen, 0); + if (!err) + break; + + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto fail; + } + + *((__be16 *)buf) = cpu_to_be16(msglen); + pfx_iov.iov_base = buf; + pfx_iov.iov_len = sizeof(buf); + iov_iter_kvec(&pfx_iter, WRITE, &pfx_iov, 1, pfx_iov.iov_len); + + err = sk_msg_memcopy_from_iter(sk, &pfx_iter, &emsg->skmsg, + pfx_iov.iov_len); + if (err < 0) + goto fail; + + err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, &emsg->skmsg, size); + if (err < 0) + goto fail; + + end = emsg->skmsg.sg.end; + emsg->len = size; + sk_msg_iter_var_prev(end); + sg_mark_end(sk_msg_elem(&emsg->skmsg, end)); + + tcp_rate_check_app_limited(sk); + + err = espintcp_push_msgs(sk); + /* this message could be partially sent, keep it */ + if (err < 0) + goto unlock; + release_sock(sk); + + return size; + +fail: + sk_msg_free(sk, &emsg->skmsg); + memset(emsg, 0, sizeof(*emsg)); +unlock: + release_sock(sk); + return err; +} + +static struct proto espintcp_prot __ro_after_init; +static struct proto_ops espintcp_ops __ro_after_init; + +static void espintcp_data_ready(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + strp_data_ready(&ctx->strp); +} + +static void espintcp_tx_work(struct work_struct *work) +{ + struct espintcp_ctx *ctx = container_of(work, + struct espintcp_ctx, work); + struct sock *sk = ctx->strp.sk; + + lock_sock(sk); + if (!ctx->tx_running) + espintcp_push_msgs(sk); + release_sock(sk); +} + +static void espintcp_write_space(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + schedule_work(&ctx->work); + ctx->saved_write_space(sk); +} + +static void espintcp_destruct(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + kfree(ctx); +} + +bool tcp_is_ulp_esp(struct sock *sk) +{ + return sk->sk_prot == &espintcp_prot; +} +EXPORT_SYMBOL_GPL(tcp_is_ulp_esp); + +static int espintcp_init_sk(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct strp_callbacks cb = { + .rcv_msg = espintcp_rcv, + .parse_msg = espintcp_parse, + }; + struct espintcp_ctx *ctx; + int err; + + /* sockmap is not compatible with espintcp */ + if (sk->sk_user_data) + return -EBUSY; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + err = strp_init(&ctx->strp, sk, &cb); + if (err) + goto free; + + __sk_dst_reset(sk); + + strp_check_rcv(&ctx->strp); + skb_queue_head_init(&ctx->ike_queue); + skb_queue_head_init(&ctx->out_queue); + sk->sk_prot = &espintcp_prot; + sk->sk_socket->ops = &espintcp_ops; + ctx->saved_data_ready = sk->sk_data_ready; + ctx->saved_write_space = sk->sk_write_space; + sk->sk_data_ready = espintcp_data_ready; + sk->sk_write_space = espintcp_write_space; + sk->sk_destruct = espintcp_destruct; + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_WORK(&ctx->work, espintcp_tx_work); + + /* avoid using task_frag */ + sk->sk_allocation = GFP_ATOMIC; + + return 0; + +free: + kfree(ctx); + return err; +} + +static void espintcp_release(struct sock *sk) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct sk_buff_head queue; + struct sk_buff *skb; + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&ctx->out_queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + espintcp_push_skb(sk, skb); + + tcp_release_cb(sk); +} + +static void espintcp_close(struct sock *sk, long timeout) +{ + struct espintcp_ctx *ctx = espintcp_getctx(sk); + struct espintcp_msg *emsg = &ctx->partial; + + strp_stop(&ctx->strp); + + sk->sk_prot = &tcp_prot; + barrier(); + + cancel_work_sync(&ctx->work); + strp_done(&ctx->strp); + + skb_queue_purge(&ctx->out_queue); + skb_queue_purge(&ctx->ike_queue); + + if (emsg->len) { + if (emsg->skb) + kfree_skb(emsg->skb); + else + sk_msg_free(sk, &emsg->skmsg); + } + + tcp_close(sk, timeout); +} + +static __poll_t espintcp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + __poll_t mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct espintcp_ctx *ctx = espintcp_getctx(sk); + + if (!skb_queue_empty(&ctx->ike_queue)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static struct tcp_ulp_ops espintcp_ulp __read_mostly = { + .name = "espintcp", + .owner = THIS_MODULE, + .init = espintcp_init_sk, +}; + +void __init espintcp_init(void) +{ + memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot)); + memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops)); + espintcp_prot.sendmsg = espintcp_sendmsg; + espintcp_prot.recvmsg = espintcp_recvmsg; + espintcp_prot.close = espintcp_close; + espintcp_prot.release_cb = espintcp_release; + espintcp_ops.poll = espintcp_poll; + + tcp_register_ulp(&espintcp_ulp); +} diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f2d1e573ea55..297d1eb79e5c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -39,6 +39,9 @@ #ifdef CONFIG_XFRM_STATISTICS #include #endif +#ifdef CONFIG_INET_ESPINTCP +#include +#endif #include "xfrm_hash.h" @@ -4157,6 +4160,10 @@ void __init xfrm_init(void) seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); +#ifdef CONFIG_INET_ESPINTCP + espintcp_init(); +#endif + RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f3423562d933..170d6e7f31d3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -670,6 +670,9 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); + if (x->encap_sk) + sock_put(rcu_dereference_raw(x->encap_sk)); + xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc. -- cgit v1.2.3 From 4e7696d90b51a1a73ce0e8174f3aff58b914619c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 9 Dec 2019 13:45:18 +0800 Subject: sctp: get netns from asoc and ep base Commit 312434617cb1 ("sctp: cache netns in sctp_ep_common") set netns in asoc and ep base since they're created, and it will never change. It's a better way to get netns from asoc and ep base, comparing to calling sock_net(). This patch is to replace them. v1->v2: - no change. Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/associola.c | 10 +++++----- net/sctp/chunk.c | 2 +- net/sctp/endpointola.c | 6 +++--- net/sctp/input.c | 5 ++--- net/sctp/output.c | 2 +- net/sctp/outqueue.c | 6 +++--- net/sctp/sm_make_chunk.c | 7 +++---- net/sctp/sm_sideeffect.c | 16 ++++++---------- net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 12 +++++------- net/sctp/stream.c | 3 +-- net/sctp/stream_interleave.c | 23 ++++++++++------------- net/sctp/transport.c | 2 +- net/sctp/ulpqueue.c | 15 +++++++-------- 14 files changed, 49 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index bbd5004a5d09..437079a4883d 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -584,7 +584,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, const gfp_t gfp, const int peer_state) { - struct net *net = sock_net(asoc->base.sk); struct sctp_transport *peer; struct sctp_sock *sp; unsigned short port; @@ -614,7 +613,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return peer; } - peer = sctp_transport_new(net, addr, gfp); + peer = sctp_transport_new(asoc->base.net, addr, gfp); if (!peer) return NULL; @@ -974,7 +973,7 @@ static void sctp_assoc_bh_rcv(struct work_struct *work) struct sctp_association *asoc = container_of(work, struct sctp_association, base.inqueue.immediate); - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; union sctp_subtype subtype; struct sctp_endpoint *ep; struct sctp_chunk *chunk; @@ -1442,7 +1441,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Should we send a SACK to update our peer? */ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) { - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; + switch (asoc->state) { case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: @@ -1576,7 +1576,7 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, if (asoc->peer.ipv6_address) flags |= SCTP_ADDR6_PEERSUPP; - return sctp_bind_addr_copy(sock_net(asoc->base.sk), + return sctp_bind_addr_copy(asoc->base.net, &asoc->base.bind_addr, &asoc->ep->base.bind_addr, scope, gfp, flags); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index cc3ce5d80b08..ab6a997e222f 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -225,7 +225,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) - SCTP_INC_STATS(sock_net(asoc->base.sk), + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 3ccab7440c9e..48c9c2c7602f 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -244,7 +244,7 @@ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct sctp_endpoint *retval = NULL; if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) && - net_eq(sock_net(ep->base.sk), net)) { + net_eq(ep->base.net, net)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; @@ -292,8 +292,8 @@ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct sctp_sockaddr_entry *addr; + struct net *net = ep->base.net; struct sctp_bind_addr *bp; - struct net *net = sock_net(ep->base.sk); bp = &ep->base.bind_addr; /* This function is called with the socket lock held, @@ -384,7 +384,7 @@ normal: if (asoc && sctp_chunk_is_data(chunk)) asoc->peer.last_data_from = chunk->transport; else { - SCTP_INC_STATS(sock_net(ep->base.sk), SCTP_MIB_INCTRLCHUNKS); + SCTP_INC_STATS(ep->base.net, SCTP_MIB_INCTRLCHUNKS); if (asoc) asoc->stats.ictrlchunks++; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 4d2bcfc9d7f8..efaaefc3bb1c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -937,7 +937,7 @@ int sctp_hash_transport(struct sctp_transport *t) if (t->asoc->temp) return 0; - arg.net = sock_net(t->asoc->base.sk); + arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); @@ -1004,12 +1004,11 @@ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { - struct net *net = sock_net(ep->base.sk); struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, - .net = net, + .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; diff --git a/net/sctp/output.c b/net/sctp/output.c index dbda7e7927fd..1441eaf460bb 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -282,7 +282,7 @@ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, sctp_chunk_free(sack); goto out; } - SCTP_INC_STATS(sock_net(asoc->base.sk), + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 0dab62b67b9a..a031d1134c60 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -279,7 +279,7 @@ void sctp_outq_free(struct sctp_outq *q) /* Put a new chunk in an sctp_outq. */ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { - struct net *net = sock_net(q->asoc->base.sk); + struct net *net = q->asoc->base.net; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? @@ -533,7 +533,7 @@ void sctp_retransmit_mark(struct sctp_outq *q, void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason) { - struct net *net = sock_net(q->asoc->base.sk); + struct net *net = q->asoc->base.net; switch (reason) { case SCTP_RTXR_T3_RTX: @@ -1884,6 +1884,6 @@ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); - SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 48d63956a68c..09050c1d5517 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2307,7 +2307,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; @@ -2363,8 +2362,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * also give us an option to silently ignore the packet, which * is what we'll do here. */ - if (!net->sctp.addip_noauth && - (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { + if (!asoc->base.net->sctp.addip_noauth && + (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); @@ -2491,9 +2490,9 @@ static int sctp_process_param(struct sctp_association *asoc, const union sctp_addr *peer_addr, gfp_t gfp) { - struct net *net = sock_net(asoc->base.sk); struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; + struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index acd737d4c0e0..ce82699d0dca 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -516,8 +516,6 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, struct sctp_transport *transport, int is_hb) { - struct net *net = sock_net(asoc->base.sk); - /* The check for association's overall error counter exceeding the * threshold is done in the state function. */ @@ -544,10 +542,10 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, * is SCTP_ACTIVE, then mark this transport as Partially Failed, * see SCTP Quick Failover Draft, section 5.1 */ - if (net->sctp.pf_enable && - (transport->state == SCTP_ACTIVE) && - (transport->error_count < transport->pathmaxrxt) && - (transport->error_count > transport->pf_retrans)) { + if (asoc->base.net->sctp.pf_enable && + transport->state == SCTP_ACTIVE && + transport->error_count < transport->pathmaxrxt && + transport->error_count > transport->pf_retrans) { sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_PF, @@ -798,10 +796,8 @@ static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds, int err = 0; if (sctp_outq_sack(&asoc->outqueue, chunk)) { - struct net *net = sock_net(asoc->base.sk); - /* There are no more TSNs awaiting SACK. */ - err = sctp_do_sm(net, SCTP_EVENT_T_OTHER, + err = sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), asoc->state, asoc->ep, asoc, NULL, GFP_ATOMIC); @@ -834,7 +830,7 @@ static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_association *new) { - struct net *net = sock_net(asoc->base.sk); + struct net *net = asoc->base.net; struct sctp_chunk *abort; if (!sctp_assoc_update(asoc, new)) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4ab8208a2dd4..42558fa3f3e4 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1320,7 +1320,7 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, struct sctp_chunk *init, struct sctp_cmd_seq *commands) { - struct net *net = sock_net(new_asoc->base.sk); + struct net *net = new_asoc->base.net; struct sctp_transport *new_addr; int ret = 1; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0b485952a71c..1b56fc440606 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -436,8 +436,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) static int sctp_send_asconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct net *net = sock_net(asoc->base.sk); - int retval = 0; + int retval = 0; /* If there is an outstanding ASCONF chunk, queue it for later * transmission. @@ -449,7 +448,7 @@ static int sctp_send_asconf(struct sctp_association *asoc, /* Hold the chunk until an ASCONF_ACK is received. */ sctp_chunk_hold(chunk); - retval = sctp_primitive_ASCONF(net, asoc, chunk); + retval = sctp_primitive_ASCONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); else @@ -2428,9 +2427,8 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, int error; if (params->spp_flags & SPP_HB_DEMAND && trans) { - struct net *net = sock_net(trans->asoc->base.sk); - - error = sctp_primitive_REQUESTHEARTBEAT(net, trans->asoc, trans); + error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc->base.net, + trans->asoc, trans); if (error) return error; } @@ -5364,7 +5362,7 @@ struct sctp_transport *sctp_transport_get_next(struct net *net, if (!sctp_transport_hold(t)) continue; - if (net_eq(sock_net(t->asoc->base.sk), net) && + if (net_eq(t->asoc->base.net, net) && t->asoc->peer.primary_path == t) break; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index e83cdaa2ab76..df60b5ef24cb 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -218,10 +218,9 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct net *net = sock_net(asoc->base.sk); int retval = 0; - retval = sctp_primitive_RECONF(net, asoc, chunk); + retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 40c40be23fcb..6b13f737ebf2 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -241,9 +241,8 @@ out: if (!first_frag) return NULL; - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, - last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { @@ -326,7 +325,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { @@ -337,8 +336,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( goto out; found: - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), - &ulpq->reasm, + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -630,7 +628,7 @@ out: if (!first_frag) return NULL; - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { @@ -716,7 +714,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { @@ -727,8 +725,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( goto out; found: - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), - &ulpq->reasm_uo, + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -814,7 +811,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) return NULL; out: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { @@ -921,7 +918,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) return NULL; out: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { @@ -1159,7 +1156,7 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); - SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS); + SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 7235a6032671..f4de064598f8 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -334,7 +334,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp); if (tp->rttvar || tp->srtt) { - struct net *net = sock_net(tp->asoc->base.sk); + struct net *net = tp->asoc->base.net; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index b6536b7f14c0..1c6c640607c5 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -486,10 +486,9 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul cevent = sctp_skb2event(pd_first); pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { - retval = sctp_make_reassembled_event(sock_net(asoc->base.sk), + retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, - pd_first, - pd_last); + pd_first, pd_last); if (retval) sctp_ulpq_set_pd(ulpq); } @@ -497,7 +496,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul done: return retval; found: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; @@ -563,8 +562,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) * further. */ done: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; @@ -664,8 +663,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) * further. */ done: - retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk), - &ulpq->reasm, first_frag, last_frag); + retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, + first_frag, last_frag); return retval; } -- cgit v1.2.3 From 65e6d90168f3593df0ae598502bcbf20d78ff0fb Mon Sep 17 00:00:00 2001 From: "Kevin(Yudong) Yang" Date: Mon, 9 Dec 2019 14:19:59 -0500 Subject: net-tcp: Disable TCP ssthresh metrics cache by default This patch introduces a sysctl knob "net.ipv4.tcp_no_ssthresh_metrics_save" that disables TCP ssthresh metrics cache by default. Other parts of TCP metrics cache, e.g. rtt, cwnd, remain unchanged. As modern networks becoming more and more dynamic, TCP metrics cache today often causes more harm than benefits. For example, the same IP address is often shared by different subscribers behind NAT in residential networks. Even if the IP address is not shared by different users, caching the slow-start threshold of a previous short flow using loss-based congestion control (e.g. cubic) often causes the future longer flows of the same network path to exit slow-start prematurely with abysmal throughput. Caching ssthresh is very risky and can lead to terrible performance. Therefore it makes sense to make disabling ssthresh caching by default and opt-in for specific networks by the administrators. This practice also has worked well for several years of deployment with CUBIC congestion control at Google. Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: Kevin(Yudong) Yang Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 4 ++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_metrics.c | 13 +++++++++---- 5 files changed, 24 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index fd26788e8c96..90b8f14bc41a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -479,6 +479,10 @@ tcp_no_metrics_save - BOOLEAN degradation. If set, TCP will not cache metrics on closing connections. +tcp_no_ssthresh_metrics_save - BOOLEAN + Controls whether TCP saves ssthresh metrics in the route cache. + Default is 1, which disables ssthresh metrics. + tcp_orphan_retries - INTEGER This value influences the timeout of a locally closed TCP connection, when RTO retransmissions remain unacknowledged. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index c0c0791b1912..08b98414d94e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -154,6 +154,7 @@ struct netns_ipv4 { int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; + int sysctl_tcp_no_ssthresh_metrics_save; int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index fcb2cd167f64..9684af02e0a5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1192,6 +1192,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_no_ssthresh_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 92282f98dc82..26637fce324d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2674,6 +2674,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c4848e7a0aad..279db8822439 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk) if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tp->snd_cwnd >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk) } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { @@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ @@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk) if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); - val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ? + 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) -- cgit v1.2.3 From 02288248b051ae9a9438278148f44bdfb0a4068b Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Tue, 10 Dec 2019 00:52:44 +0100 Subject: tipc: eliminate gap indicator from ACK messages When we increase the link send window we sometimes observe the following scenario: 1) A packet #N arrives out of order far ahead of a sequence of older packets which are still under way. The packet is added to the deferred queue. 2) The missing packets arrive in sequence, and for each 16th of them an ACK is sent back to the receiver, as it should be. 3) When building those ACK messages, it is checked if there is a gap between the link's 'rcv_nxt' and the first packet in the deferred queue. This is always the case until packet number #N-1 arrives, and a 'gap' indicator is added, effectively turning them into NACK messages. 4) When those NACKs arrive at the sender, all the requested retransmissions are done, since it is a first-time request. This sometimes leads to a huge amount of redundant retransmissions, causing a drop in max throughput. This problem gets worse when we in a later commit introduce variable window congestion control, since it drops the link back to 'fast recovery' much more often than necessary. We now fix this by not sending any 'gap' indicator in regular ACK messages. We already have a mechanism for sending explicit NACKs in place, and this is sufficient to keep up the packet flow. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 24d4d10756d3..6d864461f83c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1521,7 +1521,8 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { u32 def_cnt = ++l->stats.deferred_recv; - u32 defq_len = skb_queue_len(&l->deferdq); + struct sk_buff_head *dfq = &l->deferdq; + u32 defq_len = skb_queue_len(dfq); int match1, match2; if (link_is_bc_rcvlink(l)) { @@ -1532,8 +1533,12 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, return 0; } - if (defq_len >= 3 && !((defq_len - 3) % 16)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); + if (defq_len >= 3 && !((defq_len - 3) % 16)) { + u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; + + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, + rcvgap, 0, 0, xmitq); + } return 0; } @@ -1631,7 +1636,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) return; - if (!skb_queue_empty(dfq)) + if ((probe || probe_reply) && !skb_queue_empty(dfq)) rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, @@ -2079,7 +2084,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (rcvgap || reply) tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); /* If NACK, retransmit will now start at right position */ -- cgit v1.2.3 From d3b09995ab930df225929b4153b7187f1bb8a396 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Tue, 10 Dec 2019 00:52:45 +0100 Subject: tipc: eliminate more unnecessary nacks and retransmissions When we increase the link tranmsit window we often observe the following scenario: 1) A STATE message bypasses a sequence of traffic packets and arrives far ahead of those to the receiver. STATE messages contain a 'peers_nxt_snt' field to indicate which was the last packet sent from the peer. This mechanism is intended as a last resort for the receiver to detect missing packets, e.g., during very low traffic when there is no packet flow to help early loss detection. 3) The receiving link compares the 'peer_nxt_snt' field to its own 'rcv_nxt', finds that there is a gap, and immediately sends a NACK message back to the peer. 4) When this NACKs arrives at the sender, all the requested retransmissions are performed, since it is a first-time request. Just like in the scenario described in the previous commit this leads to many redundant retransmissions, with decreased throughput as a consequence. We fix this by adding two more conditions before we send a NACK in this sitution. First, the deferred queue must be empty, so we cannot assume that the potential packet loss has already been detected by other means. Second, we check the 'peers_snd_nxt' field only in probe/ probe_reply messages, thus turning this into a true mechanism of last resort as it was really meant to be. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 6d864461f83c..3528181fd0f3 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2079,8 +2079,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ - if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) + if ((reply || msg_is_keepalive(hdr)) && + more(peers_snd_nxt, rcv_nxt) && + !tipc_link_is_synching(l) && + skb_queue_empty(&l->deferdq)) rcvgap = peers_snd_nxt - l->rcv_nxt; + if (rcvgap || reply) tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); -- cgit v1.2.3 From 16ad3f4022bb53c7541a0bf0410b32d0231ebef9 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Tue, 10 Dec 2019 00:52:46 +0100 Subject: tipc: introduce variable window congestion control We introduce a simple variable window congestion control for links. The algorithm is inspired by the Reno algorithm, covering both 'slow start', 'congestion avoidance', and 'fast recovery' modes. - We introduce hard lower and upper window limits per link, still different and configurable per bearer type. - We introduce a 'slow start theshold' variable, initially set to the maximum window size. - We let a link start at the minimum congestion window, i.e. in slow start mode, and then let is grow rapidly (+1 per rceived ACK) until it reaches the slow start threshold and enters congestion avoidance mode. - In congestion avoidance mode we increment the congestion window for each window-size number of acked packets, up to a possible maximum equal to the configured maximum window. - For each non-duplicate NACK received, we drop back to fast recovery mode, by setting the both the slow start threshold to and the congestion window to (current_congestion_window / 2). - If the timeout handler finds that the transmit queue has not moved since the previous timeout, it drops the link back to slow start and forces a probe containing the last sent sequence number to the sent to the peer, so that this can discover the stale situation. This change does in reality have effect only on unicast ethernet transport, as we have seen that there is no room whatsoever for increasing the window max size for the UDP bearer. For now, we also choose to keep the limits for the broadcast link unchanged and equal. This algorithm seems to give a 50-100% throughput improvement for messages larger than MTU. Suggested-by: Xin Long Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 11 ++-- net/tipc/bearer.c | 11 ++-- net/tipc/bearer.h | 6 +- net/tipc/eth_media.c | 3 +- net/tipc/ib_media.c | 5 +- net/tipc/link.c | 175 +++++++++++++++++++++++++++++++++++---------------- net/tipc/link.h | 9 +-- net/tipc/node.c | 16 ++--- net/tipc/udp_media.c | 3 +- 9 files changed, 160 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 55aeba681cf4..42e01e9cf893 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -562,18 +562,18 @@ int tipc_bclink_reset_stats(struct net *net) return 0; } -static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) +static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; - if (limit < BCLINK_WIN_MIN) - limit = BCLINK_WIN_MIN; - if (limit > TIPC_MAX_LINK_WIN) + if (max_win < BCLINK_WIN_MIN) + max_win = BCLINK_WIN_MIN; + if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, limit); + tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); tipc_bcast_unlock(net); return 0; } @@ -683,6 +683,7 @@ int tipc_bcast_init(struct net *net) if (!tipc_link_bc_create(net, 0, 0, FB_MTU, BCLINK_WIN_DEFAULT, + BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d7ec26bd739d..34ca7b789eba 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -311,7 +311,8 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->identity = bearer_id; b->tolerance = m->tolerance; - b->window = m->window; + b->min_win = m->min_win; + b->max_win = m->max_win; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; @@ -796,7 +797,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) goto prop_msg_full; if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) @@ -1088,7 +1089,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; @@ -1142,7 +1143,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) goto prop_msg_full; if (media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) @@ -1275,7 +1276,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (m->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index d0c79cc6c0c2..bc0023119da2 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -119,7 +119,8 @@ struct tipc_media { char *raw); u32 priority; u32 tolerance; - u32 window; + u32 min_win; + u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; @@ -158,7 +159,8 @@ struct tipc_bearer { struct packet_type pt; struct rcu_head rcu; u32 priority; - u32 window; + u32 min_win; + u32 max_win; u32 tolerance; u32 domain; u32 identity; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index f69a2fde9f4a..8b0bb600602d 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -92,7 +92,8 @@ struct tipc_media eth_media_info = { .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index e8c16718e3fa..7aa9ff88458d 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -42,6 +42,8 @@ #include "core.h" #include "bearer.h" +#define TIPC_MAX_IB_LINK_WIN 500 + /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) @@ -94,7 +96,8 @@ struct tipc_media ib_media_info = { .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" diff --git a/net/tipc/link.c b/net/tipc/link.c index 3528181fd0f3..94dd48cd70a3 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -164,7 +164,6 @@ struct tipc_link { struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; - u16 window; /* Reception */ u16 rcv_nxt; @@ -175,6 +174,12 @@ struct tipc_link { /* Congestion handling */ struct sk_buff_head wakeupq; + u16 window; + u16 min_win; + u16 ssthresh; + u16 max_win; + u16 cong_acks; + u16 checkpoint; /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; @@ -244,12 +249,13 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); -static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); +static int tipc_link_release_pkts(struct tipc_link *l, u16 to); static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq); - +static void tipc_link_update_cwin(struct tipc_link *l, int released, + bool retransmitted); /* * Simple non-static link routines (i.e. referenced outside this file) */ @@ -308,9 +314,14 @@ u32 tipc_link_id(struct tipc_link *l) return l->peer_bearer_id << 16 | l->bearer_id; } -int tipc_link_window(struct tipc_link *l) +int tipc_link_min_win(struct tipc_link *l) { - return l->window; + return l->min_win; +} + +int tipc_link_max_win(struct tipc_link *l) +{ + return l->max_win; } int tipc_link_prio(struct tipc_link *l) @@ -436,7 +447,8 @@ u32 tipc_link_state(struct tipc_link *l) * @net_plane: network plane (A,B,c..) this link belongs to * @mtu: mtu to be advertised by link * @priority: priority to be used by link - * @window: send window to be used by link + * @min_win: minimal send window to be used by link + * @max_win: maximal send window to be used by link * @session: session to be used by link * @ownnode: identity of own node * @peer: node id of peer node @@ -451,7 +463,7 @@ u32 tipc_link_state(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 self, + u32 min_win, u32 max_win, u32 session, u32 self, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -495,7 +507,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->advertised_mtu = mtu; l->mtu = mtu; l->priority = priority; - tipc_link_set_queue_limits(l, window); + tipc_link_set_queue_limits(l, min_win, max_win); l->ackers = 1; l->bc_sndlink = bc_sndlink; l->bc_rcvlink = bc_rcvlink; @@ -523,7 +535,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * Returns true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -531,9 +543,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, { struct tipc_link *l; - if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, NULL, peer_caps, bc_sndlink, - NULL, inputq, namedq, link)) + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, + max_win, 0, ownnode, peer, NULL, peer_caps, + bc_sndlink, NULL, inputq, namedq, link)) return false; l = *link; @@ -772,6 +784,8 @@ bool tipc_link_too_silent(struct tipc_link *l) return (l->silent_intv_cnt + 2 > l->abort_limit); } +static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, + u16 from, u16 to, struct sk_buff_head *xmitq); /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -804,6 +818,11 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) probe |= l->silent_intv_cnt; if (probe || mstate->monitoring) l->silent_intv_cnt++; + if (l->snd_nxt == l->checkpoint) { + tipc_link_update_cwin(l, 0, 0); + probe = true; + } + l->checkpoint = l->snd_nxt; break; case LINK_RESET: setup = l->rst_cnt++ <= 4; @@ -959,7 +978,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, int pkt_cnt = skb_queue_len(list); int imp = msg_importance(hdr); unsigned int mss = tipc_link_mss(l); - unsigned int maxwin = l->window; + unsigned int cwin = l->window; unsigned int mtu = l->mtu; bool new_bundle; int rc = 0; @@ -988,7 +1007,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, /* Prepare each packet for sending, and add to relevant queue: */ while ((skb = __skb_dequeue(list))) { - if (likely(skb_queue_len(transmq) < maxwin)) { + if (likely(skb_queue_len(transmq) < cwin)) { hdr = buf_msg(skb); msg_set_seqno(hdr, seqno); msg_set_ack(hdr, ack); @@ -1035,17 +1054,61 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return rc; } +static void tipc_link_update_cwin(struct tipc_link *l, int released, + bool retransmitted) +{ + int bklog_len = skb_queue_len(&l->backlogq); + struct sk_buff_head *txq = &l->transmq; + int txq_len = skb_queue_len(txq); + u16 cwin = l->window; + + /* Enter fast recovery */ + if (unlikely(retransmitted)) { + l->ssthresh = max_t(u16, l->window / 2, 300); + l->window = l->ssthresh; + return; + } + /* Enter slow start */ + if (unlikely(!released)) { + l->ssthresh = max_t(u16, l->window / 2, 300); + l->window = l->min_win; + return; + } + /* Don't increase window if no pressure on the transmit queue */ + if (txq_len + bklog_len < cwin) + return; + + /* Don't increase window if there are holes the transmit queue */ + if (txq_len && l->snd_nxt - buf_seqno(skb_peek(txq)) != txq_len) + return; + + l->cong_acks += released; + + /* Slow start */ + if (cwin <= l->ssthresh) { + l->window = min_t(u16, cwin + released, l->max_win); + return; + } + /* Congestion avoidance */ + if (l->cong_acks < cwin) + return; + l->window = min_t(u16, ++cwin, l->max_win); + l->cong_acks = 0; +} + static void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + struct sk_buff_head *txq = &l->transmq; struct sk_buff *skb, *_skb; - struct tipc_msg *hdr; - u16 seqno = l->snd_nxt; u16 ack = l->rcv_nxt - 1; - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u16 seqno = l->snd_nxt; + struct tipc_msg *hdr; + u16 cwin = l->window; u32 imp; - while (skb_queue_len(&l->transmq) < l->window) { + while (skb_queue_len(txq) < cwin) { skb = skb_peek(&l->backlogq); if (!skb) break; @@ -1141,6 +1204,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; + int retransmitted = 0; struct tipc_msg *hdr; int rc = 0; @@ -1160,7 +1224,6 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, continue; if (more(msg_seqno(hdr), to)) break; - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; @@ -1173,11 +1236,12 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted++; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } + tipc_link_update_cwin(l, 0, retransmitted); return 0; } @@ -1338,9 +1402,9 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, return rc; } -static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) +static int tipc_link_release_pkts(struct tipc_link *l, u16 acked) { - bool released = false; + int released = 0; struct sk_buff *skb, *tmp; skb_queue_walk_safe(&l->transmq, skb, tmp) { @@ -1348,7 +1412,7 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) break; __skb_unlink(skb, &l->transmq); kfree_skb(skb); - released = true; + released++; } return released; } @@ -1417,8 +1481,10 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; bool passed = false; + u16 released = 0; u16 seqno, n = 0; int rc = 0; @@ -1430,6 +1496,7 @@ next_gap_ack: /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); + released++; } else if (less_eq(seqno, acked + gap)) { /* First, check if repeated retrans failures occurs? */ if (!passed && link_retransmit_failure(l, l, &rc)) @@ -1449,7 +1516,7 @@ next_gap_ack: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; @@ -1463,7 +1530,10 @@ next_gap_ack: goto next_gap_ack; } } - + if (released || retransmitted) + tipc_link_update_cwin(l, released, retransmitted); + if (released) + tipc_link_advance_backlog(l, xmitq); return 0; } @@ -1487,7 +1557,6 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) l->snd_nxt = l->rcv_nxt; return TIPC_LINK_SND_STATE; } - /* Unicast ACK */ l->rcv_unacked = 0; l->stats.sent_acks++; @@ -1553,6 +1622,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *defq = &l->deferdq; struct tipc_msg *hdr = buf_msg(skb); u16 seqno, rcv_nxt, win_lim; + int released = 0; int rc = 0; /* Verify and update link state */ @@ -1571,21 +1641,17 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, if (unlikely(!link_is_up(l))) { if (l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; - goto drop; + kfree_skb(skb); + break; } /* Drop if outside receive window */ if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) { l->stats.duplicates++; - goto drop; - } - - /* Forward queues and wake up waiting users */ - if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { - tipc_link_advance_backlog(l, xmitq); - if (unlikely(!skb_queue_empty(&l->wakeupq))) - link_prepare_wakeup(l); + kfree_skb(skb); + break; } + released += tipc_link_release_pkts(l, msg_ack(hdr)); /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { @@ -1608,9 +1674,13 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, break; } while ((skb = __tipc_skb_dequeue(defq, l->rcv_nxt))); - return rc; -drop: - kfree_skb(skb); + /* Forward queues and wake up waiting users */ + if (released) { + tipc_link_update_cwin(l, released, 0); + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } return rc; } @@ -2084,17 +2154,13 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, !tipc_link_is_synching(l) && skb_queue_empty(&l->deferdq)) rcvgap = peers_snd_nxt - l->rcv_nxt; - if (rcvgap || reply) tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); - /* If NACK, retransmit will now start at right position */ + rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); if (gap) l->stats.recv_nacks++; - - tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } @@ -2313,15 +2379,18 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; } -void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) { int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); - l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); - l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); + l->min_win = min_win; + l->ssthresh = max_win; + l->max_win = max_win; + l->window = min_win; + l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } @@ -2374,10 +2443,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (max_win < TIPC_DEF_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) return -EINVAL; } @@ -2613,7 +2682,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) goto prop_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index c09e9d49d0a3..d3c1c3fc1659 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,7 +73,7 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, + u32 min_win, u32 max_win, u32 session, u32 ownnode, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link); bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); -int tipc_link_window(struct tipc_link *l); +int tipc_link_min_win(struct tipc_link *l); +int tipc_link_max_win(struct tipc_link *l); void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); @@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); -void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); diff --git a/net/tipc/node.c b/net/tipc/node.c index ab04e00cb95b..99b28b69fc17 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1139,7 +1139,8 @@ void tipc_node_check_dest(struct net *net, u32 addr, snd_l = tipc_bc_sndlink(net); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, U16_MAX, - tipc_link_window(snd_l), + tipc_link_min_win(snd_l), + tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, @@ -1233,7 +1234,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, session, + b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -2360,8 +2361,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; - err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], - props); + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; @@ -2380,10 +2380,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - tipc_link_set_queue_limits(link, win); + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, + tipc_link_min_win(link), + max_win); } } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ed113735c019..d6620ad53546 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -828,7 +828,8 @@ struct tipc_media udp_media_info = { .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_DEF_LINK_WIN, .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, -- cgit v1.2.3 From 5000b28b0b1a34144b39376318cafb8c2a0f79fd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 10 Dec 2019 02:41:48 +0000 Subject: tcp: Cleanup duplicate initialization of sk->sk_state. When a TCP socket is created, sk->sk_state is initialized twice as TCP_CLOSE in sock_init_data() and tcp_init_sock(). The tcp_init_sock() is always called after the sock_init_data(), so it is not necessary to update sk->sk_state in the tcp_init_sock(). Before v2.1.8, the code of the two functions was in the inet_create(). In the patch of v2.1.8, the tcp_v4/v6_init_sock() were added and the code of initialization of sk->state was duplicated. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8a39ee794891..09e2cae92956 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -443,8 +443,6 @@ void tcp_init_sock(struct sock *sk) tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; - sk->sk_state = TCP_CLOSE; - sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); -- cgit v1.2.3 From b590cb5f802dc20c72f507f7fbe6737222d0afba Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 10 Dec 2019 11:19:33 -0800 Subject: bpf: Switch to offsetofend in BPF_PROG_TEST_RUN Switch existing pattern of "offsetof(..., member) + FIELD_SIZEOF(..., member)' to "offsetofend(..., member)" which does exactly what we need without all the copy-paste. Suggested-by: Andrii Nakryiko Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191210191933.105321-1-sdf@google.com --- net/bpf/test_run.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 915c2d6f7fb9..85c8cbbada92 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -252,22 +252,19 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* priority is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + - FIELD_SIZEOF(struct __sk_buff, priority), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + - FIELD_SIZEOF(struct __sk_buff, cb), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ - if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + - FIELD_SIZEOF(struct __sk_buff, tstamp), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, tstamp), sizeof(struct __sk_buff))) return -EINVAL; @@ -437,8 +434,7 @@ static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) /* flags is allowed */ - if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) + - FIELD_SIZEOF(struct bpf_flow_keys, flags), + if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; -- cgit v1.2.3 From c5144fcbf25015b850e97a06a24b1ddfbc5579e8 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 10 Dec 2019 11:43:02 +0100 Subject: vsock/virtio_transport_common: remove unused virtio header includes We can remove virtio header includes, because virtio_transport_common doesn't use virtio API, but provides common functions to interface virtio/vhost transports with the af_vsock core, and to handle the protocol. Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e5ea29c6bca7..0e20b0f6eb65 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -11,9 +11,6 @@ #include #include #include -#include -#include -#include #include #include -- cgit v1.2.3 From ef343b35d46667668a099655fca4a5b2e43a5dfe Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 10 Dec 2019 11:43:03 +0100 Subject: vsock: add VMADDR_CID_LOCAL definition The VMADDR_CID_RESERVED (1) was used by VMCI, but now it is not used anymore, so we can reuse it for local communication (loopback) adding the new well-know CID: VMADDR_CID_LOCAL. Cc: Jorgen Hansen Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/uapi/linux/vm_sockets.h | 8 +++++--- net/vmw_vsock/vmci_transport.c | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index 68d57c5e99bc..fd0ed7221645 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -99,11 +99,13 @@ #define VMADDR_CID_HYPERVISOR 0 -/* This CID is specific to VMCI and can be considered reserved (even VMCI - * doesn't use it anymore, it's a legacy value from an older release). +/* Use this as the destination CID in an address when referring to the + * local communication (loopback). + * (This was VMADDR_CID_RESERVED, but even VMCI doesn't use it anymore, + * it was a legacy value from an older release). */ -#define VMADDR_CID_RESERVED 1 +#define VMADDR_CID_LOCAL 1 /* Use this as the destination CID in an address when referring to the host * (any process other than the hypervisor). VMCI relies on it being 2, but diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 644d32e43d23..4b8b1150a738 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -648,7 +648,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg) static bool vmci_transport_stream_allow(u32 cid, u32 port) { static const u32 non_socket_contexts[] = { - VMADDR_CID_RESERVED, + VMADDR_CID_LOCAL, }; int i; -- cgit v1.2.3 From 0e12190578d081d4fe54d41635ec6e5a6eb0d01a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 10 Dec 2019 11:43:04 +0100 Subject: vsock: add local transport support in the vsock core This patch allows to register a transport able to handle local communication (loopback). Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ net/vmw_vsock/af_vsock.c | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 4206dc6d813f..b1c717286993 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -98,6 +98,8 @@ struct vsock_transport_send_notify_data { #define VSOCK_TRANSPORT_F_G2H 0x00000002 /* Transport provides DGRAM communication */ #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 +/* Transport provides local (loopback) communication */ +#define VSOCK_TRANSPORT_F_LOCAL 0x00000008 struct vsock_transport { struct module *module; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 74db4cd637a7..3da0749a0c97 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -136,6 +136,8 @@ static const struct vsock_transport *transport_h2g; static const struct vsock_transport *transport_g2h; /* Transport used for DGRAM communication */ static const struct vsock_transport *transport_dgram; +/* Transport used for local communication */ +static const struct vsock_transport *transport_local; static DEFINE_MUTEX(vsock_register_mutex); /**** UTILS ****/ @@ -2137,7 +2139,7 @@ EXPORT_SYMBOL_GPL(vsock_core_get_transport); int vsock_core_register(const struct vsock_transport *t, int features) { - const struct vsock_transport *t_h2g, *t_g2h, *t_dgram; + const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; int err = mutex_lock_interruptible(&vsock_register_mutex); if (err) @@ -2146,6 +2148,7 @@ int vsock_core_register(const struct vsock_transport *t, int features) t_h2g = transport_h2g; t_g2h = transport_g2h; t_dgram = transport_dgram; + t_local = transport_local; if (features & VSOCK_TRANSPORT_F_H2G) { if (t_h2g) { @@ -2171,9 +2174,18 @@ int vsock_core_register(const struct vsock_transport *t, int features) t_dgram = t; } + if (features & VSOCK_TRANSPORT_F_LOCAL) { + if (t_local) { + err = -EBUSY; + goto err_busy; + } + t_local = t; + } + transport_h2g = t_h2g; transport_g2h = t_g2h; transport_dgram = t_dgram; + transport_local = t_local; err_busy: mutex_unlock(&vsock_register_mutex); @@ -2194,6 +2206,9 @@ void vsock_core_unregister(const struct vsock_transport *t) if (transport_dgram == t) transport_dgram = NULL; + if (transport_local == t) + transport_local = NULL; + mutex_unlock(&vsock_register_mutex); } EXPORT_SYMBOL_GPL(vsock_core_unregister); -- cgit v1.2.3 From 077263fba10011d7e1d17dc7691ff20c15c56081 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 10 Dec 2019 11:43:05 +0100 Subject: vsock: add vsock_loopback transport This patch adds a new vsock_loopback transport to handle local communication. This transport is based on the loopback implementation of virtio_transport, so it uses the virtio_transport_common APIs to interface with the vsock core. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- MAINTAINERS | 1 + net/vmw_vsock/Kconfig | 12 +++ net/vmw_vsock/Makefile | 1 + net/vmw_vsock/vsock_loopback.c | 180 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 194 insertions(+) create mode 100644 net/vmw_vsock/vsock_loopback.c (limited to 'net') diff --git a/MAINTAINERS b/MAINTAINERS index ce7fd2e7aa1a..a28c77ee6b0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17480,6 +17480,7 @@ F: net/vmw_vsock/diag.c F: net/vmw_vsock/af_vsock_tap.c F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c +F: net/vmw_vsock/vsock_loopback.c F: drivers/net/vsockmon.c F: drivers/vhost/vsock.c F: tools/testing/vsock/ diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 8abcb815af2d..56356d2980c8 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -26,6 +26,18 @@ config VSOCKETS_DIAG Enable this module so userspace applications can query open sockets. +config VSOCKETS_LOOPBACK + tristate "Virtual Sockets loopback transport" + depends on VSOCKETS + default y + select VIRTIO_VSOCKETS_COMMON + help + This module implements a loopback transport for Virtual Sockets, + using vmw_vsock_virtio_transport_common. + + To compile this driver as a module, choose M here: the module + will be called vsock_loopback. If unsure, say N. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index 7c6f9a0b67b0..6a943ec95c4a 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o +obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c new file mode 100644 index 000000000000..a45f7ffca8c5 --- /dev/null +++ b/net/vmw_vsock/vsock_loopback.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* loopback transport for vsock using virtio_transport_common APIs + * + * Copyright (C) 2013-2019 Red Hat, Inc. + * Authors: Asias He + * Stefan Hajnoczi + * Stefano Garzarella + * + */ +#include +#include +#include +#include + +struct vsock_loopback { + struct workqueue_struct *workqueue; + + spinlock_t pkt_list_lock; /* protects pkt_list */ + struct list_head pkt_list; + struct work_struct pkt_work; +}; + +static struct vsock_loopback the_vsock_loopback; + +static u32 vsock_loopback_get_local_cid(void) +{ + return VMADDR_CID_LOCAL; +} + +static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + int len = pkt->len; + + spin_lock_bh(&vsock->pkt_list_lock); + list_add_tail(&pkt->list, &vsock->pkt_list); + spin_unlock_bh(&vsock->pkt_list_lock); + + queue_work(vsock->workqueue, &vsock->pkt_work); + + return len; +} + +static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + struct virtio_vsock_pkt *pkt, *n; + LIST_HEAD(freeme); + + spin_lock_bh(&vsock->pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + return 0; +} + +static struct virtio_transport loopback_transport = { + .transport = { + .module = THIS_MODULE, + + .get_local_cid = vsock_loopback_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + .shutdown = virtio_transport_shutdown, + .cancel_pkt = vsock_loopback_cancel_pkt, + + .dgram_bind = virtio_transport_dgram_bind, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + .notify_buffer_size = virtio_transport_notify_buffer_size, + }, + + .send_pkt = vsock_loopback_send_pkt, +}; + +static void vsock_loopback_work(struct work_struct *work) +{ + struct vsock_loopback *vsock = + container_of(work, struct vsock_loopback, pkt_work); + LIST_HEAD(pkts); + + spin_lock_bh(&vsock->pkt_list_lock); + list_splice_init(&vsock->pkt_list, &pkts); + spin_unlock_bh(&vsock->pkt_list_lock); + + while (!list_empty(&pkts)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_recv_pkt(&loopback_transport, pkt); + } +} + +static int __init vsock_loopback_init(void) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + int ret; + + vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); + if (!vsock->workqueue) + return -ENOMEM; + + spin_lock_init(&vsock->pkt_list_lock); + INIT_LIST_HEAD(&vsock->pkt_list); + INIT_WORK(&vsock->pkt_work, vsock_loopback_work); + + ret = vsock_core_register(&loopback_transport.transport, + VSOCK_TRANSPORT_F_LOCAL); + if (ret) + goto out_wq; + + return 0; + +out_wq: + destroy_workqueue(vsock->workqueue); + return ret; +} + +static void __exit vsock_loopback_exit(void) +{ + struct vsock_loopback *vsock = &the_vsock_loopback; + struct virtio_vsock_pkt *pkt; + + vsock_core_unregister(&loopback_transport.transport); + + flush_work(&vsock->pkt_work); + + spin_lock_bh(&vsock->pkt_list_lock); + while (!list_empty(&vsock->pkt_list)) { + pkt = list_first_entry(&vsock->pkt_list, + struct virtio_vsock_pkt, list); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->pkt_list_lock); + + destroy_workqueue(vsock->workqueue); +} + +module_init(vsock_loopback_init); +module_exit(vsock_loopback_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Stefano Garzarella "); +MODULE_DESCRIPTION("loopback transport for vsock"); +MODULE_ALIAS_NETPROTO(PF_VSOCK); -- cgit v1.2.3 From 408624af4c89989117bb2c6517bd50b7708a2fcd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 10 Dec 2019 11:43:06 +0100 Subject: vsock: use local transport when it is loaded Now that we have a transport that can handle the local communication, we can use it when it is loaded. A socket will use the local transport (loopback) when the remote CID is: - equal to VMADDR_CID_LOCAL - or equal to transport_g2h->get_local_cid(), if transport_g2h is loaded (this allows us to keep the same behavior implemented by virtio and vmci transports) - or equal to VMADDR_CID_HOST, if transport_g2h is not loaded Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 3da0749a0c97..9c5b2a91baad 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -388,6 +388,21 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected) } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); +static bool vsock_use_local_transport(unsigned int remote_cid) +{ + if (!transport_local) + return false; + + if (remote_cid == VMADDR_CID_LOCAL) + return true; + + if (transport_g2h) { + return remote_cid == transport_g2h->get_local_cid(); + } else { + return remote_cid == VMADDR_CID_HOST; + } +} + static void vsock_deassign_transport(struct vsock_sock *vsk) { if (!vsk->transport) @@ -404,9 +419,9 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) * (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: + * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if + * g2h is not loaded, will use local transport; * - remote CID <= VMADDR_CID_HOST will use guest->host transport; - * - remote CID == local_cid (guest->host transport) will use guest->host - * transport for loopback (host->guest transports don't support loopback); * - remote CID > VMADDR_CID_HOST will use host->guest transport; */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) @@ -421,9 +436,9 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) new_transport = transport_dgram; break; case SOCK_STREAM: - if (remote_cid <= VMADDR_CID_HOST || - (transport_g2h && - remote_cid == transport_g2h->get_local_cid())) + if (vsock_use_local_transport(remote_cid)) + new_transport = transport_local; + else if (remote_cid <= VMADDR_CID_HOST) new_transport = transport_g2h; else new_transport = transport_h2g; @@ -466,6 +481,9 @@ bool vsock_find_cid(unsigned int cid) if (transport_h2g && cid == VMADDR_CID_HOST) return true; + if (transport_local && cid == VMADDR_CID_LOCAL) + return true; + return false; } EXPORT_SYMBOL_GPL(vsock_find_cid); -- cgit v1.2.3 From bf5432b1de1fcfd1d389111c350b88dd238860ba Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 10 Dec 2019 11:43:07 +0100 Subject: vsock/virtio: remove loopback handling We can remove the loopback handling from virtio_transport, because now the vsock core is able to handle local communication using the new vsock_loopback device. Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport.c | 61 ++-------------------------------------- 1 file changed, 2 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 1458c5c8b64d..dfbaf6bd8b1c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -44,10 +44,6 @@ struct virtio_vsock { spinlock_t send_pkt_list_lock; struct list_head send_pkt_list; - struct work_struct loopback_work; - spinlock_t loopback_list_lock; /* protects loopback_list */ - struct list_head loopback_list; - atomic_t queued_replies; /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] @@ -86,20 +82,6 @@ out_rcu: return ret; } -static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, - struct virtio_vsock_pkt *pkt) -{ - int len = pkt->len; - - spin_lock_bh(&vsock->loopback_list_lock); - list_add_tail(&pkt->list, &vsock->loopback_list); - spin_unlock_bh(&vsock->loopback_list_lock); - - queue_work(virtio_vsock_workqueue, &vsock->loopback_work); - - return len; -} - static void virtio_transport_send_pkt_work(struct work_struct *work) { @@ -194,7 +176,8 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) } if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { - len = virtio_transport_send_pkt_loopback(vsock, pkt); + virtio_transport_free_pkt(pkt); + len = -ENODEV; goto out_rcu; } @@ -502,33 +485,6 @@ static struct virtio_transport virtio_transport = { .send_pkt = virtio_transport_send_pkt, }; -static void virtio_transport_loopback_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, loopback_work); - LIST_HEAD(pkts); - - spin_lock_bh(&vsock->loopback_list_lock); - list_splice_init(&vsock->loopback_list, &pkts); - spin_unlock_bh(&vsock->loopback_list_lock); - - mutex_lock(&vsock->rx_lock); - - if (!vsock->rx_run) - goto out; - - while (!list_empty(&pkts)) { - struct virtio_vsock_pkt *pkt; - - pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - virtio_transport_recv_pkt(&virtio_transport, pkt); - } -out: - mutex_unlock(&vsock->rx_lock); -} - static void virtio_transport_rx_work(struct work_struct *work) { struct virtio_vsock *vsock = @@ -633,13 +589,10 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_init(&vsock->event_lock); spin_lock_init(&vsock->send_pkt_list_lock); INIT_LIST_HEAD(&vsock->send_pkt_list); - spin_lock_init(&vsock->loopback_list_lock); - INIT_LIST_HEAD(&vsock->loopback_list); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); - INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work); mutex_lock(&vsock->tx_lock); vsock->tx_run = true; @@ -720,22 +673,12 @@ static void virtio_vsock_remove(struct virtio_device *vdev) } spin_unlock_bh(&vsock->send_pkt_list_lock); - spin_lock_bh(&vsock->loopback_list_lock); - while (!list_empty(&vsock->loopback_list)) { - pkt = list_first_entry(&vsock->loopback_list, - struct virtio_vsock_pkt, list); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - spin_unlock_bh(&vsock->loopback_list_lock); - /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); /* Other works can be queued before 'config->del_vqs()', so we flush * all works before to free the vsock object to avoid use after free. */ - flush_work(&vsock->loopback_work); flush_work(&vsock->rx_work); flush_work(&vsock->tx_work); flush_work(&vsock->event_work); -- cgit v1.2.3 From b4653342b1514cb11f25b727c689451aff02996d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 9 Dec 2019 13:03:40 +0300 Subject: net: Allow to show socket-specific information in /proc/[pid]/fdinfo/[fd] This adds .show_fdinfo to socket_file_ops, so protocols will be able to print their specific data in fdinfo. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/net.h | 1 + net/socket.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 9cafb5f353a9..6451425e828f 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -171,6 +171,7 @@ struct proto_ops { int (*compat_getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); #endif + void (*show_fdinfo)(struct seq_file *m, struct socket *sock); int (*sendmsg) (struct socket *sock, struct msghdr *m, size_t total_len); /* Notes for implementing recvmsg: diff --git a/net/socket.c b/net/socket.c index 4d38d49d6ad9..532f123e0459 100644 --- a/net/socket.c +++ b/net/socket.c @@ -128,6 +128,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +static void sock_show_fdinfo(struct seq_file *m, struct file *f); /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear @@ -150,6 +151,9 @@ static const struct file_operations socket_file_ops = { .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, +#ifdef CONFIG_PROC_FS + .show_fdinfo = sock_show_fdinfo, +#endif }; /* @@ -993,6 +997,14 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) return res; } +static void sock_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct socket *sock = f->private_data; + + if (sock->ops->show_fdinfo) + sock->ops->show_fdinfo(m, sock); +} + /* * Atomic setting of ioctl hooks to avoid race * with module unload. -- cgit v1.2.3 From 3c32da19a858fb1ae8a76bf899160be49f338506 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 9 Dec 2019 13:03:46 +0300 Subject: unix: Show number of pending scm files of receive queue in fdinfo Unix sockets like a block box. You never know what is stored there: there may be a file descriptor holding a mount or a block device, or there may be whole universes with namespaces, sockets with receive queues full of sockets etc. The patch adds a little debug and accounts number of files (not recursive), which is in receive queue of a unix socket. Sometimes this is useful to determine, that socket should be investigated or which task should be killed to put reference counter on a resourse. v2: Pass correct argument to lockdep Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/af_unix.h | 5 +++++ net/unix/af_unix.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 3426d6dacc45..17e10fba2152 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -41,6 +41,10 @@ struct unix_skb_parms { u32 consumed; } __randomize_layout; +struct scm_stat { + u32 nr_fds; +}; + #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) #define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) @@ -65,6 +69,7 @@ struct unix_sock { #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; + struct scm_stat scm_stat; }; static inline struct unix_sock *unix_sk(const struct sock *sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7cfdce10de36..050eed3b30dc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -676,6 +676,16 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } +static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) +{ + struct sock *sk = sock->sk; + struct unix_sock *u; + + if (sk) { + u = unix_sk(sock->sk); + seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds)); + } +} static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, @@ -701,6 +711,7 @@ static const struct proto_ops unix_stream_ops = { .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { @@ -726,6 +737,7 @@ static const struct proto_ops unix_dgram_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { @@ -751,6 +763,7 @@ static const struct proto_ops unix_seqpacket_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, + .show_fdinfo = unix_show_fdinfo, }; static struct proto unix_proto = { @@ -788,6 +801,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); + memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -1572,6 +1586,28 @@ static bool unix_skb_scm_eq(struct sk_buff *skb, unix_secdata_eq(scm, skb); } +static void scm_stat_add(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds += fp->count; +} + +static void scm_stat_del(struct sock *sk, struct sk_buff *skb) +{ + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + + lockdep_assert_held(&sk->sk_receive_queue.lock); + + if (unlikely(fp && fp->count)) + u->scm_stat.nr_fds -= fp->count; +} + /* * Send AF_UNIX data. */ @@ -1757,7 +1793,10 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); @@ -1859,7 +1898,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto pipe_err_free; maybe_add_creds(skb, sock, other); - skb_queue_tail(&other->sk_receive_queue, skb); + spin_lock(&other->sk_receive_queue.lock); + scm_stat_add(other, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2058,8 +2100,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, - &last); + skb = __skb_try_recv_datagram(sk, flags, scm_stat_del, + &skip, &err, &last); if (skb) break; @@ -2353,8 +2395,12 @@ unlock: sk_peek_offset_bwd(sk, chunk); - if (UNIXCB(skb).fp) + if (UNIXCB(skb).fp) { + spin_lock(&sk->sk_receive_queue.lock); + scm_stat_del(sk, skb); + spin_unlock(&sk->sk_receive_queue.lock); unix_detach_fds(&scm, skb); + } if (unix_skb_len(skb)) break; -- cgit v1.2.3 From f74877a5457d34d604dba6dbbb13c4c05bac8b93 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 11 Dec 2019 10:58:14 +0100 Subject: rtnetlink: provide permanent hardware address in RTM_NEWLINK Permanent hardware address of a network device was traditionally provided via ethtool ioctl interface but as Jiri Pirko pointed out in a review of ethtool netlink interface, rtnetlink is much more suitable for it so let's add it to the RTM_NEWLINK message. Add IFLA_PERM_ADDRESS attribute to RTM_NEWLINK messages unless the permanent address is all zeros (i.e. device driver did not fill it). As permanent address is not modifiable, reject userspace requests containing IFLA_PERM_ADDRESS attribute. Note: we already provide permanent hardware address for bond slaves; unfortunately we cannot drop that attribute for backward compatibility reasons. v5 -> v6: only add the attribute if permanent address is not zero Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8aec8769d944..1d69f637c5d6 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -169,6 +169,7 @@ enum { IFLA_MAX_MTU, IFLA_PROP_LIST, IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 02916f43bf63..20bc406f3871 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1041,6 +1041,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + 0; } @@ -1757,6 +1758,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; + if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && + nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) + goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) @@ -1822,6 +1826,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, + [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From 32d5109a9d864aea3981f0b5ea736eee4e11b42a Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 11 Dec 2019 10:58:19 +0100 Subject: netlink: rename nl80211_validate_nested() to nla_validate_nested() Function nl80211_validate_nested() is not specific to nl80211, it's a counterpart to nla_validate_nested_deprecated() with strict validation. For consistency with other validation and parse functions, rename it to nla_validate_nested(). Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 8 ++++---- net/wireless/nl80211.c | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index b140c8f1be22..56c365dc6dc7 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1735,7 +1735,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) } /** - * nla_validate_nested - Validate a stream of nested attributes + * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy @@ -1758,9 +1758,9 @@ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, } static inline int -nl80211_validate_nested(const struct nlattr *start, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +nla_validate_nested(const struct nlattr *start, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_STRICT, extack); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index da5262b2298b..fa3526592c51 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12900,8 +12900,7 @@ static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd, return -EINVAL; } - return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy, - extack); + return nla_validate_nested(attr, vcmd->maxattr, vcmd->policy, extack); } static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3 From 9ce48e5a09ea63a7867647af68caa0605d99757d Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 11 Dec 2019 10:58:24 +0100 Subject: ethtool: move to its own directory The ethtool netlink interface is going to be split into multiple files so that it will be more convenient to put all of them in a separate directory net/ethtool. Start by moving current ethtool.c with ioctl interface into this directory and renaming it to ioctl.c. Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/Makefile | 2 +- net/core/Makefile | 2 +- net/core/ethtool.c | 3116 -------------------------------------------------- net/ethtool/Makefile | 3 + net/ethtool/ioctl.c | 3116 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 3121 insertions(+), 3118 deletions(-) delete mode 100644 net/core/ethtool.c create mode 100644 net/ethtool/Makefile create mode 100644 net/ethtool/ioctl.c (limited to 'net') diff --git a/net/Makefile b/net/Makefile index 449fc0b221f8..848303d98d3d 100644 --- a/net/Makefile +++ b/net/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ -obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ diff --git a/net/core/Makefile b/net/core/Makefile index a104dc8faafc..3e2c378e5f31 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -8,7 +8,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ +obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ fib_notifier.o xdp.o flow_offload.o diff --git a/net/core/ethtool.c b/net/core/ethtool.c deleted file mode 100644 index cd9bc67381b2..000000000000 --- a/net/core/ethtool.c +++ /dev/null @@ -1,3116 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/core/ethtool.c - Ethtool ioctl handler - * Copyright (c) 2003 Matthew Wilcox - * - * This file is where we call all the ethtool_ops commands to get - * the information ethtool needs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Some useful ethtool_ops methods that're device independent. - * If we find that all drivers want to do the same thing here, - * we can turn these into dev_() function calls. - */ - -u32 ethtool_op_get_link(struct net_device *dev) -{ - return netif_carrier_ok(dev) ? 1 : 0; -} -EXPORT_SYMBOL(ethtool_op_get_link); - -int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) -{ - info->so_timestamping = - SOF_TIMESTAMPING_TX_SOFTWARE | - SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE; - info->phc_index = -1; - return 0; -} -EXPORT_SYMBOL(ethtool_op_get_ts_info); - -/* Handlers for each ethtool command */ - -#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) - -static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { - [NETIF_F_SG_BIT] = "tx-scatter-gather", - [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", - [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", - [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", - [NETIF_F_HIGHDMA_BIT] = "highdma", - [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", - [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", - - [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", - [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", - [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", - [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", - [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", - [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", - [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_LLTX_BIT] = "tx-lockless", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", - [NETIF_F_GRO_BIT] = "rx-gro", - [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", - [NETIF_F_LRO_BIT] = "rx-lro", - - [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", - [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", - [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", - [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", - [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", - [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", - [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", - [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", - [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", - [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", - [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", - [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", - [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", - [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", - - [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", - [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", - [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", - [NETIF_F_RXHASH_BIT] = "rx-hashing", - [NETIF_F_RXCSUM_BIT] = "rx-checksum", - [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", - [NETIF_F_LOOPBACK_BIT] = "loopback", - [NETIF_F_RXFCS_BIT] = "rx-fcs", - [NETIF_F_RXALL_BIT] = "rx-all", - [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", - [NETIF_F_HW_TC_BIT] = "hw-tc-offload", - [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", - [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", - [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", - [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", - [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", - [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", -}; - -static const char -rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { - [ETH_RSS_HASH_TOP_BIT] = "toeplitz", - [ETH_RSS_HASH_XOR_BIT] = "xor", - [ETH_RSS_HASH_CRC32_BIT] = "crc32", -}; - -static const char -tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", - [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", - [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", -}; - -static const char -phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", - [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", - [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", -}; - -static int ethtool_get_features(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_gfeatures cmd = { - .cmd = ETHTOOL_GFEATURES, - .size = ETHTOOL_DEV_FEATURE_WORDS, - }; - struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; - u32 __user *sizeaddr; - u32 copy_size; - int i; - - /* in case feature bits run out again */ - BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); - - for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { - features[i].available = (u32)(dev->hw_features >> (32 * i)); - features[i].requested = (u32)(dev->wanted_features >> (32 * i)); - features[i].active = (u32)(dev->features >> (32 * i)); - features[i].never_changed = - (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); - } - - sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); - if (get_user(copy_size, sizeaddr)) - return -EFAULT; - - if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) - copy_size = ETHTOOL_DEV_FEATURE_WORDS; - - if (copy_to_user(useraddr, &cmd, sizeof(cmd))) - return -EFAULT; - useraddr += sizeof(cmd); - if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) - return -EFAULT; - - return 0; -} - -static int ethtool_set_features(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_sfeatures cmd; - struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; - netdev_features_t wanted = 0, valid = 0; - int i, ret = 0; - - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) - return -EFAULT; - useraddr += sizeof(cmd); - - if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) - return -EINVAL; - - if (copy_from_user(features, useraddr, sizeof(features))) - return -EFAULT; - - for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { - valid |= (netdev_features_t)features[i].valid << (32 * i); - wanted |= (netdev_features_t)features[i].requested << (32 * i); - } - - if (valid & ~NETIF_F_ETHTOOL_BITS) - return -EINVAL; - - if (valid & ~dev->hw_features) { - valid &= dev->hw_features; - ret |= ETHTOOL_F_UNSUPPORTED; - } - - dev->wanted_features &= ~valid; - dev->wanted_features |= wanted & valid; - __netdev_update_features(dev); - - if ((dev->wanted_features ^ dev->features) & valid) - ret |= ETHTOOL_F_WISH; - - return ret; -} - -static int __ethtool_get_sset_count(struct net_device *dev, int sset) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (sset == ETH_SS_FEATURES) - return ARRAY_SIZE(netdev_features_strings); - - if (sset == ETH_SS_RSS_HASH_FUNCS) - return ARRAY_SIZE(rss_hash_func_strings); - - if (sset == ETH_SS_TUNABLES) - return ARRAY_SIZE(tunable_strings); - - if (sset == ETH_SS_PHY_TUNABLES) - return ARRAY_SIZE(phy_tunable_strings); - - if (sset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - return phy_ethtool_get_sset_count(dev->phydev); - - if (ops->get_sset_count && ops->get_strings) - return ops->get_sset_count(dev, sset); - else - return -EOPNOTSUPP; -} - -static void __ethtool_get_strings(struct net_device *dev, - u32 stringset, u8 *data) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (stringset == ETH_SS_FEATURES) - memcpy(data, netdev_features_strings, - sizeof(netdev_features_strings)); - else if (stringset == ETH_SS_RSS_HASH_FUNCS) - memcpy(data, rss_hash_func_strings, - sizeof(rss_hash_func_strings)); - else if (stringset == ETH_SS_TUNABLES) - memcpy(data, tunable_strings, sizeof(tunable_strings)); - else if (stringset == ETH_SS_PHY_TUNABLES) - memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); - else if (stringset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) - phy_ethtool_get_strings(dev->phydev, data); - else - /* ops->get_strings is valid because checked earlier */ - ops->get_strings(dev, stringset, data); -} - -static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) -{ - /* feature masks of legacy discrete ethtool ops */ - - switch (eth_cmd) { - case ETHTOOL_GTXCSUM: - case ETHTOOL_STXCSUM: - return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC; - case ETHTOOL_GRXCSUM: - case ETHTOOL_SRXCSUM: - return NETIF_F_RXCSUM; - case ETHTOOL_GSG: - case ETHTOOL_SSG: - return NETIF_F_SG; - case ETHTOOL_GTSO: - case ETHTOOL_STSO: - return NETIF_F_ALL_TSO; - case ETHTOOL_GGSO: - case ETHTOOL_SGSO: - return NETIF_F_GSO; - case ETHTOOL_GGRO: - case ETHTOOL_SGRO: - return NETIF_F_GRO; - default: - BUG(); - } -} - -static int ethtool_get_one_feature(struct net_device *dev, - char __user *useraddr, u32 ethcmd) -{ - netdev_features_t mask = ethtool_get_feature_mask(ethcmd); - struct ethtool_value edata = { - .cmd = ethcmd, - .data = !!(dev->features & mask), - }; - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_one_feature(struct net_device *dev, - void __user *useraddr, u32 ethcmd) -{ - struct ethtool_value edata; - netdev_features_t mask; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - mask = ethtool_get_feature_mask(ethcmd); - mask &= dev->hw_features; - if (!mask) - return -EOPNOTSUPP; - - if (edata.data) - dev->wanted_features |= mask; - else - dev->wanted_features &= ~mask; - - __netdev_update_features(dev); - - return 0; -} - -#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ - ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) -#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ - NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ - NETIF_F_RXHASH) - -static u32 __ethtool_get_flags(struct net_device *dev) -{ - u32 flags = 0; - - if (dev->features & NETIF_F_LRO) - flags |= ETH_FLAG_LRO; - if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) - flags |= ETH_FLAG_RXVLAN; - if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) - flags |= ETH_FLAG_TXVLAN; - if (dev->features & NETIF_F_NTUPLE) - flags |= ETH_FLAG_NTUPLE; - if (dev->features & NETIF_F_RXHASH) - flags |= ETH_FLAG_RXHASH; - - return flags; -} - -static int __ethtool_set_flags(struct net_device *dev, u32 data) -{ - netdev_features_t features = 0, changed; - - if (data & ~ETH_ALL_FLAGS) - return -EINVAL; - - if (data & ETH_FLAG_LRO) - features |= NETIF_F_LRO; - if (data & ETH_FLAG_RXVLAN) - features |= NETIF_F_HW_VLAN_CTAG_RX; - if (data & ETH_FLAG_TXVLAN) - features |= NETIF_F_HW_VLAN_CTAG_TX; - if (data & ETH_FLAG_NTUPLE) - features |= NETIF_F_NTUPLE; - if (data & ETH_FLAG_RXHASH) - features |= NETIF_F_RXHASH; - - /* allow changing only bits set in hw_features */ - changed = (features ^ dev->features) & ETH_ALL_FEATURES; - if (changed & ~dev->hw_features) - return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; - - dev->wanted_features = - (dev->wanted_features & ~changed) | (features & changed); - - __netdev_update_features(dev); - - return 0; -} - -/* Given two link masks, AND them together and save the result in dst. */ -void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, - struct ethtool_link_ksettings *src) -{ - unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); - unsigned int idx = 0; - - for (; idx < size; idx++) { - dst->link_modes.supported[idx] &= - src->link_modes.supported[idx]; - dst->link_modes.advertising[idx] &= - src->link_modes.advertising[idx]; - } -} -EXPORT_SYMBOL(ethtool_intersect_link_masks); - -void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, - u32 legacy_u32) -{ - bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); - dst[0] = legacy_u32; -} -EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); - -/* return false if src had higher bits set. lower bits always updated. */ -bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, - const unsigned long *src) -{ - bool retval = true; - - /* TODO: following test will soon always be true */ - if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); - - bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_fill(ext, 32); - bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); - if (bitmap_intersects(ext, src, - __ETHTOOL_LINK_MODE_MASK_NBITS)) { - /* src mask goes beyond bit 31 */ - retval = false; - } - } - *legacy_u32 = src[0]; - return retval; -} -EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); - -/* return false if legacy contained non-0 deprecated fields - * maxtxpkt/maxrxpkt. rest of ksettings always updated - */ -static bool -convert_legacy_settings_to_link_ksettings( - struct ethtool_link_ksettings *link_ksettings, - const struct ethtool_cmd *legacy_settings) -{ - bool retval = true; - - memset(link_ksettings, 0, sizeof(*link_ksettings)); - - /* This is used to tell users that driver is still using these - * deprecated legacy fields, and they should not use - * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS - */ - if (legacy_settings->maxtxpkt || - legacy_settings->maxrxpkt) - retval = false; - - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.supported, - legacy_settings->supported); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.advertising, - legacy_settings->advertising); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.lp_advertising, - legacy_settings->lp_advertising); - link_ksettings->base.speed - = ethtool_cmd_speed(legacy_settings); - link_ksettings->base.duplex - = legacy_settings->duplex; - link_ksettings->base.port - = legacy_settings->port; - link_ksettings->base.phy_address - = legacy_settings->phy_address; - link_ksettings->base.autoneg - = legacy_settings->autoneg; - link_ksettings->base.mdio_support - = legacy_settings->mdio_support; - link_ksettings->base.eth_tp_mdix - = legacy_settings->eth_tp_mdix; - link_ksettings->base.eth_tp_mdix_ctrl - = legacy_settings->eth_tp_mdix_ctrl; - return retval; -} - -/* return false if ksettings link modes had higher bits - * set. legacy_settings always updated (best effort) - */ -static bool -convert_link_ksettings_to_legacy_settings( - struct ethtool_cmd *legacy_settings, - const struct ethtool_link_ksettings *link_ksettings) -{ - bool retval = true; - - memset(legacy_settings, 0, sizeof(*legacy_settings)); - /* this also clears the deprecated fields in legacy structure: - * __u8 transceiver; - * __u32 maxtxpkt; - * __u32 maxrxpkt; - */ - - retval &= ethtool_convert_link_mode_to_legacy_u32( - &legacy_settings->supported, - link_ksettings->link_modes.supported); - retval &= ethtool_convert_link_mode_to_legacy_u32( - &legacy_settings->advertising, - link_ksettings->link_modes.advertising); - retval &= ethtool_convert_link_mode_to_legacy_u32( - &legacy_settings->lp_advertising, - link_ksettings->link_modes.lp_advertising); - ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); - legacy_settings->duplex - = link_ksettings->base.duplex; - legacy_settings->port - = link_ksettings->base.port; - legacy_settings->phy_address - = link_ksettings->base.phy_address; - legacy_settings->autoneg - = link_ksettings->base.autoneg; - legacy_settings->mdio_support - = link_ksettings->base.mdio_support; - legacy_settings->eth_tp_mdix - = link_ksettings->base.eth_tp_mdix; - legacy_settings->eth_tp_mdix_ctrl - = link_ksettings->base.eth_tp_mdix_ctrl; - legacy_settings->transceiver - = link_ksettings->base.transceiver; - return retval; -} - -/* number of 32-bit words to store the user's link mode bitmaps */ -#define __ETHTOOL_LINK_MODE_MASK_NU32 \ - DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) - -/* layout of the struct passed from/to userland */ -struct ethtool_link_usettings { - struct ethtool_link_settings base; - struct { - __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; - __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; - __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; - } link_modes; -}; - -/* Internal kernel helper to query a device ethtool_link_settings. */ -int __ethtool_get_link_ksettings(struct net_device *dev, - struct ethtool_link_ksettings *link_ksettings) -{ - ASSERT_RTNL(); - - if (!dev->ethtool_ops->get_link_ksettings) - return -EOPNOTSUPP; - - memset(link_ksettings, 0, sizeof(*link_ksettings)); - return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); -} -EXPORT_SYMBOL(__ethtool_get_link_ksettings); - -/* convert ethtool_link_usettings in user space to a kernel internal - * ethtool_link_ksettings. return 0 on success, errno on error. - */ -static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, - const void __user *from) -{ - struct ethtool_link_usettings link_usettings; - - if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) - return -EFAULT; - - memcpy(&to->base, &link_usettings.base, sizeof(to->base)); - bitmap_from_arr32(to->link_modes.supported, - link_usettings.link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_from_arr32(to->link_modes.advertising, - link_usettings.link_modes.advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_from_arr32(to->link_modes.lp_advertising, - link_usettings.link_modes.lp_advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS); - - return 0; -} - -/* convert a kernel internal ethtool_link_ksettings to - * ethtool_link_usettings in user space. return 0 on success, errno on - * error. - */ -static int -store_link_ksettings_for_user(void __user *to, - const struct ethtool_link_ksettings *from) -{ - struct ethtool_link_usettings link_usettings; - - memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); - bitmap_to_arr32(link_usettings.link_modes.supported, - from->link_modes.supported, - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_to_arr32(link_usettings.link_modes.advertising, - from->link_modes.advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS); - bitmap_to_arr32(link_usettings.link_modes.lp_advertising, - from->link_modes.lp_advertising, - __ETHTOOL_LINK_MODE_MASK_NBITS); - - if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) - return -EFAULT; - - return 0; -} - -/* Query device for its ethtool_link_settings. */ -static int ethtool_get_link_ksettings(struct net_device *dev, - void __user *useraddr) -{ - int err = 0; - struct ethtool_link_ksettings link_ksettings; - - ASSERT_RTNL(); - if (!dev->ethtool_ops->get_link_ksettings) - return -EOPNOTSUPP; - - /* handle bitmap nbits handshake */ - if (copy_from_user(&link_ksettings.base, useraddr, - sizeof(link_ksettings.base))) - return -EFAULT; - - if (__ETHTOOL_LINK_MODE_MASK_NU32 - != link_ksettings.base.link_mode_masks_nwords) { - /* wrong link mode nbits requested */ - memset(&link_ksettings, 0, sizeof(link_ksettings)); - link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; - /* send back number of words required as negative val */ - compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, - "need too many bits for link modes!"); - link_ksettings.base.link_mode_masks_nwords - = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); - - /* copy the base fields back to user, not the link - * mode bitmaps - */ - if (copy_to_user(useraddr, &link_ksettings.base, - sizeof(link_ksettings.base))) - return -EFAULT; - - return 0; - } - - /* handshake successful: user/kernel agree on - * link_mode_masks_nwords - */ - - memset(&link_ksettings, 0, sizeof(link_ksettings)); - err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); - if (err < 0) - return err; - - /* make sure we tell the right values to user */ - link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; - link_ksettings.base.link_mode_masks_nwords - = __ETHTOOL_LINK_MODE_MASK_NU32; - - return store_link_ksettings_for_user(useraddr, &link_ksettings); -} - -/* Update device ethtool_link_settings. */ -static int ethtool_set_link_ksettings(struct net_device *dev, - void __user *useraddr) -{ - int err; - struct ethtool_link_ksettings link_ksettings; - - ASSERT_RTNL(); - - if (!dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; - - /* make sure nbits field has expected value */ - if (copy_from_user(&link_ksettings.base, useraddr, - sizeof(link_ksettings.base))) - return -EFAULT; - - if (__ETHTOOL_LINK_MODE_MASK_NU32 - != link_ksettings.base.link_mode_masks_nwords) - return -EINVAL; - - /* copy the whole structure, now that we know it has expected - * format - */ - err = load_link_ksettings_from_user(&link_ksettings, useraddr); - if (err) - return err; - - /* re-check nwords field, just in case */ - if (__ETHTOOL_LINK_MODE_MASK_NU32 - != link_ksettings.base.link_mode_masks_nwords) - return -EINVAL; - - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); -} - -/* Query device for its ethtool_cmd settings. - * - * Backward compatibility note: for compatibility with legacy ethtool, this is - * now implemented via get_link_ksettings. When driver reports higher link mode - * bits, a kernel warning is logged once (with name of 1st driver/device) to - * recommend user to upgrade ethtool, but the command is successful (only the - * lower link mode bits reported back to user). Deprecated fields from - * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero. - */ -static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_link_ksettings link_ksettings; - struct ethtool_cmd cmd; - int err; - - ASSERT_RTNL(); - if (!dev->ethtool_ops->get_link_ksettings) - return -EOPNOTSUPP; - - memset(&link_ksettings, 0, sizeof(link_ksettings)); - err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); - if (err < 0) - return err; - convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); - - /* send a sensible cmd tag back to user */ - cmd.cmd = ETHTOOL_GSET; - - if (copy_to_user(useraddr, &cmd, sizeof(cmd))) - return -EFAULT; - - return 0; -} - -/* Update device link settings with given ethtool_cmd. - * - * Backward compatibility note: for compatibility with legacy ethtool, this is - * now always implemented via set_link_settings. When user's request updates - * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel - * warning is logged once (with name of 1st driver/device) to recommend user to - * upgrade ethtool, and the request is rejected. - */ -static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_link_ksettings link_ksettings; - struct ethtool_cmd cmd; - - ASSERT_RTNL(); - - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) - return -EFAULT; - if (!dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; - - if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) - return -EINVAL; - link_ksettings.base.link_mode_masks_nwords = - __ETHTOOL_LINK_MODE_MASK_NU32; - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); -} - -static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_drvinfo info; - const struct ethtool_ops *ops = dev->ethtool_ops; - - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GDRVINFO; - if (ops->get_drvinfo) { - ops->get_drvinfo(dev, &info); - } else if (dev->dev.parent && dev->dev.parent->driver) { - strlcpy(info.bus_info, dev_name(dev->dev.parent), - sizeof(info.bus_info)); - strlcpy(info.driver, dev->dev.parent->driver->name, - sizeof(info.driver)); - } else { - return -EOPNOTSUPP; - } - - /* - * this method of obtaining string set info is deprecated; - * Use ETHTOOL_GSSET_INFO instead. - */ - if (ops->get_sset_count) { - int rc; - - rc = ops->get_sset_count(dev, ETH_SS_TEST); - if (rc >= 0) - info.testinfo_len = rc; - rc = ops->get_sset_count(dev, ETH_SS_STATS); - if (rc >= 0) - info.n_stats = rc; - rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); - if (rc >= 0) - info.n_priv_flags = rc; - } - if (ops->get_regs_len) { - int ret = ops->get_regs_len(dev); - - if (ret > 0) - info.regdump_len = ret; - } - - if (ops->get_eeprom_len) - info.eedump_len = ops->get_eeprom_len(dev); - - if (!info.fw_version[0]) - devlink_compat_running_version(dev, info.fw_version, - sizeof(info.fw_version)); - - if (copy_to_user(useraddr, &info, sizeof(info))) - return -EFAULT; - return 0; -} - -static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_sset_info info; - u64 sset_mask; - int i, idx = 0, n_bits = 0, ret, rc; - u32 *info_buf = NULL; - - if (copy_from_user(&info, useraddr, sizeof(info))) - return -EFAULT; - - /* store copy of mask, because we zero struct later on */ - sset_mask = info.sset_mask; - if (!sset_mask) - return 0; - - /* calculate size of return buffer */ - n_bits = hweight64(sset_mask); - - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GSSET_INFO; - - info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER); - if (!info_buf) - return -ENOMEM; - - /* - * fill return buffer based on input bitmask and successful - * get_sset_count return - */ - for (i = 0; i < 64; i++) { - if (!(sset_mask & (1ULL << i))) - continue; - - rc = __ethtool_get_sset_count(dev, i); - if (rc >= 0) { - info.sset_mask |= (1ULL << i); - info_buf[idx++] = rc; - } - } - - ret = -EFAULT; - if (copy_to_user(useraddr, &info, sizeof(info))) - goto out; - - useraddr += offsetof(struct ethtool_sset_info, data); - if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) - goto out; - - ret = 0; - -out: - kfree(info_buf); - return ret; -} - -static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, - u32 cmd, void __user *useraddr) -{ - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); - int rc; - - if (!dev->ethtool_ops->set_rxnfc) - return -EOPNOTSUPP; - - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_SRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - - rc = dev->ethtool_ops->set_rxnfc(dev, &info); - if (rc) - return rc; - - if (cmd == ETHTOOL_SRXCLSRLINS && - copy_to_user(useraddr, &info, info_size)) - return -EFAULT; - - return 0; -} - -static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, - u32 cmd, void __user *useraddr) -{ - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); - const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; - void *rule_buf = NULL; - - if (!ops->get_rxnfc) - return -EOPNOTSUPP; - - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_GRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - - /* If FLOW_RSS was requested then user-space must be using the - * new definition, as FLOW_RSS is newer. - */ - if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { - info_size = sizeof(info); - if (copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - /* Since malicious users may modify the original data, - * we need to check whether FLOW_RSS is still requested. - */ - if (!(info.flow_type & FLOW_RSS)) - return -EINVAL; - } - - if (info.cmd != cmd) - return -EINVAL; - - if (info.cmd == ETHTOOL_GRXCLSRLALL) { - if (info.rule_cnt > 0) { - if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) - rule_buf = kcalloc(info.rule_cnt, sizeof(u32), - GFP_USER); - if (!rule_buf) - return -ENOMEM; - } - } - - ret = ops->get_rxnfc(dev, &info, rule_buf); - if (ret < 0) - goto err_out; - - ret = -EFAULT; - if (copy_to_user(useraddr, &info, info_size)) - goto err_out; - - if (rule_buf) { - useraddr += offsetof(struct ethtool_rxnfc, rule_locs); - if (copy_to_user(useraddr, rule_buf, - info.rule_cnt * sizeof(u32))) - goto err_out; - } - ret = 0; - -err_out: - kfree(rule_buf); - - return ret; -} - -static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, - struct ethtool_rxnfc *rx_rings, - u32 size) -{ - int i; - - if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) - return -EFAULT; - - /* Validate ring indices */ - for (i = 0; i < size; i++) - if (indir[i] >= rx_rings->data) - return -EINVAL; - - return 0; -} - -u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; - -void netdev_rss_key_fill(void *buffer, size_t len) -{ - BUG_ON(len > sizeof(netdev_rss_key)); - net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); - memcpy(buffer, netdev_rss_key, len); -} -EXPORT_SYMBOL(netdev_rss_key_fill); - -static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) -{ - u32 dev_size, current_max = 0; - u32 *indir; - int ret; - - if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->get_rxfh) - return -EOPNOTSUPP; - dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); - if (dev_size == 0) - return -EOPNOTSUPP; - - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) - return -ENOMEM; - - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); - if (ret) - goto out; - - while (dev_size--) - current_max = max(current_max, indir[dev_size]); - - *max = current_max; - -out: - kfree(indir); - return ret; -} - -static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, - void __user *useraddr) -{ - u32 user_size, dev_size; - u32 *indir; - int ret; - - if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->get_rxfh) - return -EOPNOTSUPP; - dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); - if (dev_size == 0) - return -EOPNOTSUPP; - - if (copy_from_user(&user_size, - useraddr + offsetof(struct ethtool_rxfh_indir, size), - sizeof(user_size))) - return -EFAULT; - - if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), - &dev_size, sizeof(dev_size))) - return -EFAULT; - - /* If the user buffer size is 0, this is just a query for the - * device table size. Otherwise, if it's smaller than the - * device table size it's an error. - */ - if (user_size < dev_size) - return user_size == 0 ? 0 : -EINVAL; - - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) - return -ENOMEM; - - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); - if (ret) - goto out; - - if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh_indir, ring_index[0]), - indir, dev_size * sizeof(indir[0]))) - ret = -EFAULT; - -out: - kfree(indir); - return ret; -} - -static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_rxnfc rx_rings; - u32 user_size, dev_size, i; - u32 *indir; - const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; - u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); - - if (!ops->get_rxfh_indir_size || !ops->set_rxfh || - !ops->get_rxnfc) - return -EOPNOTSUPP; - - dev_size = ops->get_rxfh_indir_size(dev); - if (dev_size == 0) - return -EOPNOTSUPP; - - if (copy_from_user(&user_size, - useraddr + offsetof(struct ethtool_rxfh_indir, size), - sizeof(user_size))) - return -EFAULT; - - if (user_size != 0 && user_size != dev_size) - return -EINVAL; - - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) - return -ENOMEM; - - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) - goto out; - - if (user_size == 0) { - for (i = 0; i < dev_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); - } else { - ret = ethtool_copy_validate_indir(indir, - useraddr + ringidx_offset, - &rx_rings, - dev_size); - if (ret) - goto out; - } - - ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); - if (ret) - goto out; - - /* indicate whether rxfh was set to default */ - if (user_size == 0) - dev->priv_flags &= ~IFF_RXFH_CONFIGURED; - else - dev->priv_flags |= IFF_RXFH_CONFIGURED; - -out: - kfree(indir); - return ret; -} - -static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, - void __user *useraddr) -{ - int ret; - const struct ethtool_ops *ops = dev->ethtool_ops; - u32 user_indir_size, user_key_size; - u32 dev_indir_size = 0, dev_key_size = 0; - struct ethtool_rxfh rxfh; - u32 total_size; - u32 indir_bytes; - u32 *indir = NULL; - u8 dev_hfunc = 0; - u8 *hkey = NULL; - u8 *rss_config; - - if (!ops->get_rxfh) - return -EOPNOTSUPP; - - if (ops->get_rxfh_indir_size) - dev_indir_size = ops->get_rxfh_indir_size(dev); - if (ops->get_rxfh_key_size) - dev_key_size = ops->get_rxfh_key_size(dev); - - if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) - return -EFAULT; - user_indir_size = rxfh.indir_size; - user_key_size = rxfh.key_size; - - /* Check that reserved fields are 0 for now */ - if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) - return -EINVAL; - /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !ops->get_rxfh_context) - return -EOPNOTSUPP; - - rxfh.indir_size = dev_indir_size; - rxfh.key_size = dev_key_size; - if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) - return -EFAULT; - - if ((user_indir_size && (user_indir_size != dev_indir_size)) || - (user_key_size && (user_key_size != dev_key_size))) - return -EINVAL; - - indir_bytes = user_indir_size * sizeof(indir[0]); - total_size = indir_bytes + user_key_size; - rss_config = kzalloc(total_size, GFP_USER); - if (!rss_config) - return -ENOMEM; - - if (user_indir_size) - indir = (u32 *)rss_config; - - if (user_key_size) - hkey = rss_config + indir_bytes; - - if (rxfh.rss_context) - ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey, - &dev_hfunc, - rxfh.rss_context); - else - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); - if (ret) - goto out; - - if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), - &dev_hfunc, sizeof(rxfh.hfunc))) { - ret = -EFAULT; - } else if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh, rss_config[0]), - rss_config, total_size)) { - ret = -EFAULT; - } -out: - kfree(rss_config); - - return ret; -} - -static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, - void __user *useraddr) -{ - int ret; - const struct ethtool_ops *ops = dev->ethtool_ops; - struct ethtool_rxnfc rx_rings; - struct ethtool_rxfh rxfh; - u32 dev_indir_size = 0, dev_key_size = 0, i; - u32 *indir = NULL, indir_bytes = 0; - u8 *hkey = NULL; - u8 *rss_config; - u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); - bool delete = false; - - if (!ops->get_rxnfc || !ops->set_rxfh) - return -EOPNOTSUPP; - - if (ops->get_rxfh_indir_size) - dev_indir_size = ops->get_rxfh_indir_size(dev); - if (ops->get_rxfh_key_size) - dev_key_size = ops->get_rxfh_key_size(dev); - - if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) - return -EFAULT; - - /* Check that reserved fields are 0 for now */ - if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) - return -EINVAL; - /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !ops->set_rxfh_context) - return -EOPNOTSUPP; - - /* If either indir, hash key or function is valid, proceed further. - * Must request at least one change: indir size, hash key or function. - */ - if ((rxfh.indir_size && - rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && - rxfh.indir_size != dev_indir_size) || - (rxfh.key_size && (rxfh.key_size != dev_key_size)) || - (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && - rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) - return -EINVAL; - - if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - indir_bytes = dev_indir_size * sizeof(indir[0]); - - rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); - if (!rss_config) - return -ENOMEM; - - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) - goto out; - - /* rxfh.indir_size == 0 means reset the indir table to default (master - * context) or delete the context (other RSS contexts). - * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. - */ - if (rxfh.indir_size && - rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { - indir = (u32 *)rss_config; - ret = ethtool_copy_validate_indir(indir, - useraddr + rss_cfg_offset, - &rx_rings, - rxfh.indir_size); - if (ret) - goto out; - } else if (rxfh.indir_size == 0) { - if (rxfh.rss_context == 0) { - indir = (u32 *)rss_config; - for (i = 0; i < dev_indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); - } else { - delete = true; - } - } - - if (rxfh.key_size) { - hkey = rss_config + indir_bytes; - if (copy_from_user(hkey, - useraddr + rss_cfg_offset + indir_bytes, - rxfh.key_size)) { - ret = -EFAULT; - goto out; - } - } - - if (rxfh.rss_context) - ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc, - &rxfh.rss_context, delete); - else - ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); - if (ret) - goto out; - - if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), - &rxfh.rss_context, sizeof(rxfh.rss_context))) - ret = -EFAULT; - - if (!rxfh.rss_context) { - /* indicate whether rxfh was set to default */ - if (rxfh.indir_size == 0) - dev->priv_flags &= ~IFF_RXFH_CONFIGURED; - else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - dev->priv_flags |= IFF_RXFH_CONFIGURED; - } - -out: - kfree(rss_config); - return ret; -} - -static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_regs regs; - const struct ethtool_ops *ops = dev->ethtool_ops; - void *regbuf; - int reglen, ret; - - if (!ops->get_regs || !ops->get_regs_len) - return -EOPNOTSUPP; - - if (copy_from_user(®s, useraddr, sizeof(regs))) - return -EFAULT; - - reglen = ops->get_regs_len(dev); - if (reglen <= 0) - return reglen; - - if (regs.len > reglen) - regs.len = reglen; - - regbuf = vzalloc(reglen); - if (!regbuf) - return -ENOMEM; - - if (regs.len < reglen) - reglen = regs.len; - - ops->get_regs(dev, ®s, regbuf); - - ret = -EFAULT; - if (copy_to_user(useraddr, ®s, sizeof(regs))) - goto out; - useraddr += offsetof(struct ethtool_regs, data); - if (copy_to_user(useraddr, regbuf, reglen)) - goto out; - ret = 0; - - out: - vfree(regbuf); - return ret; -} - -static int ethtool_reset(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value reset; - int ret; - - if (!dev->ethtool_ops->reset) - return -EOPNOTSUPP; - - if (copy_from_user(&reset, useraddr, sizeof(reset))) - return -EFAULT; - - ret = dev->ethtool_ops->reset(dev, &reset.data); - if (ret) - return ret; - - if (copy_to_user(useraddr, &reset, sizeof(reset))) - return -EFAULT; - return 0; -} - -static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_wolinfo wol; - - if (!dev->ethtool_ops->get_wol) - return -EOPNOTSUPP; - - memset(&wol, 0, sizeof(struct ethtool_wolinfo)); - wol.cmd = ETHTOOL_GWOL; - dev->ethtool_ops->get_wol(dev, &wol); - - if (copy_to_user(useraddr, &wol, sizeof(wol))) - return -EFAULT; - return 0; -} - -static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_wolinfo wol; - int ret; - - if (!dev->ethtool_ops->set_wol) - return -EOPNOTSUPP; - - if (copy_from_user(&wol, useraddr, sizeof(wol))) - return -EFAULT; - - ret = dev->ethtool_ops->set_wol(dev, &wol); - if (ret) - return ret; - - dev->wol_enabled = !!wol.wolopts; - - return 0; -} - -static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_eee edata; - int rc; - - if (!dev->ethtool_ops->get_eee) - return -EOPNOTSUPP; - - memset(&edata, 0, sizeof(struct ethtool_eee)); - edata.cmd = ETHTOOL_GEEE; - rc = dev->ethtool_ops->get_eee(dev, &edata); - - if (rc) - return rc; - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - - return 0; -} - -static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_eee edata; - - if (!dev->ethtool_ops->set_eee) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - return dev->ethtool_ops->set_eee(dev, &edata); -} - -static int ethtool_nway_reset(struct net_device *dev) -{ - if (!dev->ethtool_ops->nway_reset) - return -EOPNOTSUPP; - - return dev->ethtool_ops->nway_reset(dev); -} - -static int ethtool_get_link(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; - - if (!dev->ethtool_ops->get_link) - return -EOPNOTSUPP; - - edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, - int (*getter)(struct net_device *, - struct ethtool_eeprom *, u8 *), - u32 total_len) -{ - struct ethtool_eeprom eeprom; - void __user *userbuf = useraddr + sizeof(eeprom); - u32 bytes_remaining; - u8 *data; - int ret = 0; - - if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) - return -EFAULT; - - /* Check for wrap and zero */ - if (eeprom.offset + eeprom.len <= eeprom.offset) - return -EINVAL; - - /* Check for exceeding total eeprom len */ - if (eeprom.offset + eeprom.len > total_len) - return -EINVAL; - - data = kmalloc(PAGE_SIZE, GFP_USER); - if (!data) - return -ENOMEM; - - bytes_remaining = eeprom.len; - while (bytes_remaining > 0) { - eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); - - ret = getter(dev, &eeprom, data); - if (ret) - break; - if (copy_to_user(userbuf, data, eeprom.len)) { - ret = -EFAULT; - break; - } - userbuf += eeprom.len; - eeprom.offset += eeprom.len; - bytes_remaining -= eeprom.len; - } - - eeprom.len = userbuf - (useraddr + sizeof(eeprom)); - eeprom.offset -= eeprom.len; - if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) - ret = -EFAULT; - - kfree(data); - return ret; -} - -static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_eeprom || !ops->get_eeprom_len || - !ops->get_eeprom_len(dev)) - return -EOPNOTSUPP; - - return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, - ops->get_eeprom_len(dev)); -} - -static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_eeprom eeprom; - const struct ethtool_ops *ops = dev->ethtool_ops; - void __user *userbuf = useraddr + sizeof(eeprom); - u32 bytes_remaining; - u8 *data; - int ret = 0; - - if (!ops->set_eeprom || !ops->get_eeprom_len || - !ops->get_eeprom_len(dev)) - return -EOPNOTSUPP; - - if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) - return -EFAULT; - - /* Check for wrap and zero */ - if (eeprom.offset + eeprom.len <= eeprom.offset) - return -EINVAL; - - /* Check for exceeding total eeprom len */ - if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) - return -EINVAL; - - data = kmalloc(PAGE_SIZE, GFP_USER); - if (!data) - return -ENOMEM; - - bytes_remaining = eeprom.len; - while (bytes_remaining > 0) { - eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); - - if (copy_from_user(data, userbuf, eeprom.len)) { - ret = -EFAULT; - break; - } - ret = ops->set_eeprom(dev, &eeprom, data); - if (ret) - break; - userbuf += eeprom.len; - eeprom.offset += eeprom.len; - bytes_remaining -= eeprom.len; - } - - kfree(data); - return ret; -} - -static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; - - if (!dev->ethtool_ops->get_coalesce) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_coalesce(dev, &coalesce); - - if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) - return -EFAULT; - return 0; -} - -static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_coalesce coalesce; - - if (!dev->ethtool_ops->set_coalesce) - return -EOPNOTSUPP; - - if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) - return -EFAULT; - - return dev->ethtool_ops->set_coalesce(dev, &coalesce); -} - -static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; - - if (!dev->ethtool_ops->get_ringparam) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_ringparam(dev, &ringparam); - - if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) - return -EFAULT; - return 0; -} - -static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; - - if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) - return -EOPNOTSUPP; - - if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) - return -EFAULT; - - dev->ethtool_ops->get_ringparam(dev, &max); - - /* ensure new ring parameters are within the maximums */ - if (ringparam.rx_pending > max.rx_max_pending || - ringparam.rx_mini_pending > max.rx_mini_max_pending || - ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || - ringparam.tx_pending > max.tx_max_pending) - return -EINVAL; - - return dev->ethtool_ops->set_ringparam(dev, &ringparam); -} - -static noinline_for_stack int ethtool_get_channels(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; - - if (!dev->ethtool_ops->get_channels) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_channels(dev, &channels); - - if (copy_to_user(useraddr, &channels, sizeof(channels))) - return -EFAULT; - return 0; -} - -static noinline_for_stack int ethtool_set_channels(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; - u16 from_channel, to_channel; - u32 max_rx_in_use = 0; - unsigned int i; - - if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) - return -EOPNOTSUPP; - - if (copy_from_user(&channels, useraddr, sizeof(channels))) - return -EFAULT; - - dev->ethtool_ops->get_channels(dev, &curr); - - /* ensure new counts are within the maximums */ - if (channels.rx_count > curr.max_rx || - channels.tx_count > curr.max_tx || - channels.combined_count > curr.max_combined || - channels.other_count > curr.max_other) - return -EINVAL; - - /* ensure the new Rx count fits within the configured Rx flow - * indirection table settings */ - if (netif_is_rxfh_configured(dev) && - !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && - (channels.combined_count + channels.rx_count) <= max_rx_in_use) - return -EINVAL; - - /* Disabling channels, query zero-copy AF_XDP sockets */ - from_channel = channels.combined_count + - min(channels.rx_count, channels.tx_count); - to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); - for (i = from_channel; i < to_channel; i++) - if (xdp_get_umem_from_qid(dev, i)) - return -EINVAL; - - return dev->ethtool_ops->set_channels(dev, &channels); -} - -static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM }; - - if (!dev->ethtool_ops->get_pauseparam) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_pauseparam(dev, &pauseparam); - - if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) - return -EFAULT; - return 0; -} - -static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_pauseparam pauseparam; - - if (!dev->ethtool_ops->set_pauseparam) - return -EOPNOTSUPP; - - if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) - return -EFAULT; - - return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); -} - -static int ethtool_self_test(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_test test; - const struct ethtool_ops *ops = dev->ethtool_ops; - u64 *data; - int ret, test_len; - - if (!ops->self_test || !ops->get_sset_count) - return -EOPNOTSUPP; - - test_len = ops->get_sset_count(dev, ETH_SS_TEST); - if (test_len < 0) - return test_len; - WARN_ON(test_len == 0); - - if (copy_from_user(&test, useraddr, sizeof(test))) - return -EFAULT; - - test.len = test_len; - data = kmalloc_array(test_len, sizeof(u64), GFP_USER); - if (!data) - return -ENOMEM; - - ops->self_test(dev, &test, data); - - ret = -EFAULT; - if (copy_to_user(useraddr, &test, sizeof(test))) - goto out; - useraddr += sizeof(test); - if (copy_to_user(useraddr, data, test.len * sizeof(u64))) - goto out; - ret = 0; - - out: - kfree(data); - return ret; -} - -static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_gstrings gstrings; - u8 *data; - int ret; - - if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) - return -EFAULT; - - ret = __ethtool_get_sset_count(dev, gstrings.string_set); - if (ret < 0) - return ret; - if (ret > S32_MAX / ETH_GSTRING_LEN) - return -ENOMEM; - WARN_ON_ONCE(!ret); - - gstrings.len = ret; - - if (gstrings.len) { - data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); - if (!data) - return -ENOMEM; - - __ethtool_get_strings(dev, gstrings.string_set, data); - } else { - data = NULL; - } - - ret = -EFAULT; - if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) - goto out; - useraddr += sizeof(gstrings); - if (gstrings.len && - copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) - goto out; - ret = 0; - -out: - vfree(data); - return ret; -} - -static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_value id; - static bool busy; - const struct ethtool_ops *ops = dev->ethtool_ops; - int rc; - - if (!ops->set_phys_id) - return -EOPNOTSUPP; - - if (busy) - return -EBUSY; - - if (copy_from_user(&id, useraddr, sizeof(id))) - return -EFAULT; - - rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); - if (rc < 0) - return rc; - - /* Drop the RTNL lock while waiting, but prevent reentry or - * removal of the device. - */ - busy = true; - dev_hold(dev); - rtnl_unlock(); - - if (rc == 0) { - /* Driver will handle this itself */ - schedule_timeout_interruptible( - id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); - } else { - /* Driver expects to be called at twice the frequency in rc */ - int n = rc * 2, i, interval = HZ / n; - - /* Count down seconds */ - do { - /* Count down iterations per second */ - i = n; - do { - rtnl_lock(); - rc = ops->set_phys_id(dev, - (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); - rtnl_unlock(); - if (rc) - break; - schedule_timeout_interruptible(interval); - } while (!signal_pending(current) && --i != 0); - } while (!signal_pending(current) && - (id.data == 0 || --id.data != 0)); - } - - rtnl_lock(); - dev_put(dev); - busy = false; - - (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); - return rc; -} - -static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_stats stats; - const struct ethtool_ops *ops = dev->ethtool_ops; - u64 *data; - int ret, n_stats; - - if (!ops->get_ethtool_stats || !ops->get_sset_count) - return -EOPNOTSUPP; - - n_stats = ops->get_sset_count(dev, ETH_SS_STATS); - if (n_stats < 0) - return n_stats; - if (n_stats > S32_MAX / sizeof(u64)) - return -ENOMEM; - WARN_ON_ONCE(!n_stats); - if (copy_from_user(&stats, useraddr, sizeof(stats))) - return -EFAULT; - - stats.n_stats = n_stats; - - if (n_stats) { - data = vzalloc(array_size(n_stats, sizeof(u64))); - if (!data) - return -ENOMEM; - ops->get_ethtool_stats(dev, &stats, data); - } else { - data = NULL; - } - - ret = -EFAULT; - if (copy_to_user(useraddr, &stats, sizeof(stats))) - goto out; - useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) - goto out; - ret = 0; - - out: - vfree(data); - return ret; -} - -static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - struct ethtool_stats stats; - u64 *data; - int ret, n_stats; - - if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) - return -EOPNOTSUPP; - - if (dev->phydev && !ops->get_ethtool_phy_stats) - n_stats = phy_ethtool_get_sset_count(dev->phydev); - else - n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); - if (n_stats < 0) - return n_stats; - if (n_stats > S32_MAX / sizeof(u64)) - return -ENOMEM; - WARN_ON_ONCE(!n_stats); - - if (copy_from_user(&stats, useraddr, sizeof(stats))) - return -EFAULT; - - stats.n_stats = n_stats; - - if (n_stats) { - data = vzalloc(array_size(n_stats, sizeof(u64))); - if (!data) - return -ENOMEM; - - if (dev->phydev && !ops->get_ethtool_phy_stats) { - ret = phy_ethtool_get_stats(dev->phydev, &stats, data); - if (ret < 0) - goto out; - } else { - ops->get_ethtool_phy_stats(dev, &stats, data); - } - } else { - data = NULL; - } - - ret = -EFAULT; - if (copy_to_user(useraddr, &stats, sizeof(stats))) - goto out; - useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) - goto out; - ret = 0; - - out: - vfree(data); - return ret; -} - -static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_perm_addr epaddr; - - if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) - return -EFAULT; - - if (epaddr.size < dev->addr_len) - return -ETOOSMALL; - epaddr.size = dev->addr_len; - - if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) - return -EFAULT; - useraddr += sizeof(epaddr); - if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) - return -EFAULT; - return 0; -} - -static int ethtool_get_value(struct net_device *dev, char __user *useraddr, - u32 cmd, u32 (*actor)(struct net_device *)) -{ - struct ethtool_value edata = { .cmd = cmd }; - - if (!actor) - return -EOPNOTSUPP; - - edata.data = actor(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, - void (*actor)(struct net_device *, u32)) -{ - struct ethtool_value edata; - - if (!actor) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - actor(dev, edata.data); - return 0; -} - -static int ethtool_set_value(struct net_device *dev, char __user *useraddr, - int (*actor)(struct net_device *, u32)) -{ - struct ethtool_value edata; - - if (!actor) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - return actor(dev, edata.data); -} - -static noinline_for_stack int ethtool_flash_device(struct net_device *dev, - char __user *useraddr) -{ - struct ethtool_flash efl; - - if (copy_from_user(&efl, useraddr, sizeof(efl))) - return -EFAULT; - efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; - - if (!dev->ethtool_ops->flash_device) - return devlink_compat_flash_update(dev, efl.data); - - return dev->ethtool_ops->flash_device(dev, &efl); -} - -static int ethtool_set_dump(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_dump dump; - - if (!dev->ethtool_ops->set_dump) - return -EOPNOTSUPP; - - if (copy_from_user(&dump, useraddr, sizeof(dump))) - return -EFAULT; - - return dev->ethtool_ops->set_dump(dev, &dump); -} - -static int ethtool_get_dump_flag(struct net_device *dev, - void __user *useraddr) -{ - int ret; - struct ethtool_dump dump; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_dump_flag) - return -EOPNOTSUPP; - - if (copy_from_user(&dump, useraddr, sizeof(dump))) - return -EFAULT; - - ret = ops->get_dump_flag(dev, &dump); - if (ret) - return ret; - - if (copy_to_user(useraddr, &dump, sizeof(dump))) - return -EFAULT; - return 0; -} - -static int ethtool_get_dump_data(struct net_device *dev, - void __user *useraddr) -{ - int ret; - __u32 len; - struct ethtool_dump dump, tmp; - const struct ethtool_ops *ops = dev->ethtool_ops; - void *data = NULL; - - if (!ops->get_dump_data || !ops->get_dump_flag) - return -EOPNOTSUPP; - - if (copy_from_user(&dump, useraddr, sizeof(dump))) - return -EFAULT; - - memset(&tmp, 0, sizeof(tmp)); - tmp.cmd = ETHTOOL_GET_DUMP_FLAG; - ret = ops->get_dump_flag(dev, &tmp); - if (ret) - return ret; - - len = min(tmp.len, dump.len); - if (!len) - return -EFAULT; - - /* Don't ever let the driver think there's more space available - * than it requested with .get_dump_flag(). - */ - dump.len = len; - - /* Always allocate enough space to hold the whole thing so that the - * driver does not need to check the length and bother with partial - * dumping. - */ - data = vzalloc(tmp.len); - if (!data) - return -ENOMEM; - ret = ops->get_dump_data(dev, &dump, data); - if (ret) - goto out; - - /* There are two sane possibilities: - * 1. The driver's .get_dump_data() does not touch dump.len. - * 2. Or it may set dump.len to how much it really writes, which - * should be tmp.len (or len if it can do a partial dump). - * In any case respond to userspace with the actual length of data - * it's receiving. - */ - WARN_ON(dump.len != len && dump.len != tmp.len); - dump.len = len; - - if (copy_to_user(useraddr, &dump, sizeof(dump))) { - ret = -EFAULT; - goto out; - } - useraddr += offsetof(struct ethtool_dump, data); - if (copy_to_user(useraddr, data, len)) - ret = -EFAULT; -out: - vfree(data); - return ret; -} - -static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) -{ - int err = 0; - struct ethtool_ts_info info; - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GET_TS_INFO; - - if (phydev && phydev->drv && phydev->drv->ts_info) { - err = phydev->drv->ts_info(phydev, &info); - } else if (ops->get_ts_info) { - err = ops->get_ts_info(dev, &info); - } else { - info.so_timestamping = - SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE; - info.phc_index = -1; - } - - if (err) - return err; - - if (copy_to_user(useraddr, &info, sizeof(info))) - err = -EFAULT; - - return err; -} - -static int __ethtool_get_module_info(struct net_device *dev, - struct ethtool_modinfo *modinfo) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - - if (dev->sfp_bus) - return sfp_get_module_info(dev->sfp_bus, modinfo); - - if (phydev && phydev->drv && phydev->drv->module_info) - return phydev->drv->module_info(phydev, modinfo); - - if (ops->get_module_info) - return ops->get_module_info(dev, modinfo); - - return -EOPNOTSUPP; -} - -static int ethtool_get_module_info(struct net_device *dev, - void __user *useraddr) -{ - int ret; - struct ethtool_modinfo modinfo; - - if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) - return -EFAULT; - - ret = __ethtool_get_module_info(dev, &modinfo); - if (ret) - return ret; - - if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) - return -EFAULT; - - return 0; -} - -static int __ethtool_get_module_eeprom(struct net_device *dev, - struct ethtool_eeprom *ee, u8 *data) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - - if (dev->sfp_bus) - return sfp_get_module_eeprom(dev->sfp_bus, ee, data); - - if (phydev && phydev->drv && phydev->drv->module_eeprom) - return phydev->drv->module_eeprom(phydev, ee, data); - - if (ops->get_module_eeprom) - return ops->get_module_eeprom(dev, ee, data); - - return -EOPNOTSUPP; -} - -static int ethtool_get_module_eeprom(struct net_device *dev, - void __user *useraddr) -{ - int ret; - struct ethtool_modinfo modinfo; - - ret = __ethtool_get_module_info(dev, &modinfo); - if (ret) - return ret; - - return ethtool_get_any_eeprom(dev, useraddr, - __ethtool_get_module_eeprom, - modinfo.eeprom_len); -} - -static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) -{ - switch (tuna->id) { - case ETHTOOL_RX_COPYBREAK: - case ETHTOOL_TX_COPYBREAK: - if (tuna->len != sizeof(u32) || - tuna->type_id != ETHTOOL_TUNABLE_U32) - return -EINVAL; - break; - case ETHTOOL_PFC_PREVENTION_TOUT: - if (tuna->len != sizeof(u16) || - tuna->type_id != ETHTOOL_TUNABLE_U16) - return -EINVAL; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) -{ - int ret; - struct ethtool_tunable tuna; - const struct ethtool_ops *ops = dev->ethtool_ops; - void *data; - - if (!ops->get_tunable) - return -EOPNOTSUPP; - if (copy_from_user(&tuna, useraddr, sizeof(tuna))) - return -EFAULT; - ret = ethtool_tunable_valid(&tuna); - if (ret) - return ret; - data = kmalloc(tuna.len, GFP_USER); - if (!data) - return -ENOMEM; - ret = ops->get_tunable(dev, &tuna, data); - if (ret) - goto out; - useraddr += sizeof(tuna); - ret = -EFAULT; - if (copy_to_user(useraddr, data, tuna.len)) - goto out; - ret = 0; - -out: - kfree(data); - return ret; -} - -static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) -{ - int ret; - struct ethtool_tunable tuna; - const struct ethtool_ops *ops = dev->ethtool_ops; - void *data; - - if (!ops->set_tunable) - return -EOPNOTSUPP; - if (copy_from_user(&tuna, useraddr, sizeof(tuna))) - return -EFAULT; - ret = ethtool_tunable_valid(&tuna); - if (ret) - return ret; - useraddr += sizeof(tuna); - data = memdup_user(useraddr, tuna.len); - if (IS_ERR(data)) - return PTR_ERR(data); - ret = ops->set_tunable(dev, &tuna, data); - - kfree(data); - return ret; -} - -static noinline_for_stack int -ethtool_get_per_queue_coalesce(struct net_device *dev, - void __user *useraddr, - struct ethtool_per_queue_op *per_queue_opt) -{ - u32 bit; - int ret; - DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); - - if (!dev->ethtool_ops->get_per_queue_coalesce) - return -EOPNOTSUPP; - - useraddr += sizeof(*per_queue_opt); - - bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, - MAX_NUM_QUEUE); - - for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { - struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; - - ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); - if (ret != 0) - return ret; - if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) - return -EFAULT; - useraddr += sizeof(coalesce); - } - - return 0; -} - -static noinline_for_stack int -ethtool_set_per_queue_coalesce(struct net_device *dev, - void __user *useraddr, - struct ethtool_per_queue_op *per_queue_opt) -{ - u32 bit; - int i, ret = 0; - int n_queue; - struct ethtool_coalesce *backup = NULL, *tmp = NULL; - DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); - - if ((!dev->ethtool_ops->set_per_queue_coalesce) || - (!dev->ethtool_ops->get_per_queue_coalesce)) - return -EOPNOTSUPP; - - useraddr += sizeof(*per_queue_opt); - - bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); - n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); - tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); - if (!backup) - return -ENOMEM; - - for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { - struct ethtool_coalesce coalesce; - - ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); - if (ret != 0) - goto roll_back; - - tmp++; - - if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { - ret = -EFAULT; - goto roll_back; - } - - ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); - if (ret != 0) - goto roll_back; - - useraddr += sizeof(coalesce); - } - -roll_back: - if (ret != 0) { - tmp = backup; - for_each_set_bit(i, queue_mask, bit) { - dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); - tmp++; - } - } - kfree(backup); - - return ret; -} - -static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, - void __user *useraddr, u32 sub_cmd) -{ - struct ethtool_per_queue_op per_queue_opt; - - if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) - return -EFAULT; - - if (per_queue_opt.sub_command != sub_cmd) - return -EINVAL; - - switch (per_queue_opt.sub_command) { - case ETHTOOL_GCOALESCE: - return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); - case ETHTOOL_SCOALESCE: - return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); - default: - return -EOPNOTSUPP; - }; -} - -static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) -{ - switch (tuna->id) { - case ETHTOOL_PHY_DOWNSHIFT: - case ETHTOOL_PHY_FAST_LINK_DOWN: - if (tuna->len != sizeof(u8) || - tuna->type_id != ETHTOOL_TUNABLE_U8) - return -EINVAL; - break; - case ETHTOOL_PHY_EDPD: - if (tuna->len != sizeof(u16) || - tuna->type_id != ETHTOOL_TUNABLE_U16) - return -EINVAL; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int get_phy_tunable(struct net_device *dev, void __user *useraddr) -{ - int ret; - struct ethtool_tunable tuna; - struct phy_device *phydev = dev->phydev; - void *data; - - if (!(phydev && phydev->drv && phydev->drv->get_tunable)) - return -EOPNOTSUPP; - - if (copy_from_user(&tuna, useraddr, sizeof(tuna))) - return -EFAULT; - ret = ethtool_phy_tunable_valid(&tuna); - if (ret) - return ret; - data = kmalloc(tuna.len, GFP_USER); - if (!data) - return -ENOMEM; - mutex_lock(&phydev->lock); - ret = phydev->drv->get_tunable(phydev, &tuna, data); - mutex_unlock(&phydev->lock); - if (ret) - goto out; - useraddr += sizeof(tuna); - ret = -EFAULT; - if (copy_to_user(useraddr, data, tuna.len)) - goto out; - ret = 0; - -out: - kfree(data); - return ret; -} - -static int set_phy_tunable(struct net_device *dev, void __user *useraddr) -{ - int ret; - struct ethtool_tunable tuna; - struct phy_device *phydev = dev->phydev; - void *data; - - if (!(phydev && phydev->drv && phydev->drv->set_tunable)) - return -EOPNOTSUPP; - if (copy_from_user(&tuna, useraddr, sizeof(tuna))) - return -EFAULT; - ret = ethtool_phy_tunable_valid(&tuna); - if (ret) - return ret; - useraddr += sizeof(tuna); - data = memdup_user(useraddr, tuna.len); - if (IS_ERR(data)) - return PTR_ERR(data); - mutex_lock(&phydev->lock); - ret = phydev->drv->set_tunable(phydev, &tuna, data); - mutex_unlock(&phydev->lock); - - kfree(data); - return ret; -} - -static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM }; - int rc; - - if (!dev->ethtool_ops->get_fecparam) - return -EOPNOTSUPP; - - rc = dev->ethtool_ops->get_fecparam(dev, &fecparam); - if (rc) - return rc; - - if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) - return -EFAULT; - return 0; -} - -static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_fecparam fecparam; - - if (!dev->ethtool_ops->set_fecparam) - return -EOPNOTSUPP; - - if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) - return -EFAULT; - - return dev->ethtool_ops->set_fecparam(dev, &fecparam); -} - -/* The main entry point in this file. Called from net/core/dev_ioctl.c */ - -int dev_ethtool(struct net *net, struct ifreq *ifr) -{ - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - void __user *useraddr = ifr->ifr_data; - u32 ethcmd, sub_cmd; - int rc; - netdev_features_t old_features; - - if (!dev || !netif_device_present(dev)) - return -ENODEV; - - if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) - return -EFAULT; - - if (ethcmd == ETHTOOL_PERQUEUE) { - if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) - return -EFAULT; - } else { - sub_cmd = ethcmd; - } - /* Allow some commands to be done by anyone */ - switch (sub_cmd) { - case ETHTOOL_GSET: - case ETHTOOL_GDRVINFO: - case ETHTOOL_GMSGLVL: - case ETHTOOL_GLINK: - case ETHTOOL_GCOALESCE: - case ETHTOOL_GRINGPARAM: - case ETHTOOL_GPAUSEPARAM: - case ETHTOOL_GRXCSUM: - case ETHTOOL_GTXCSUM: - case ETHTOOL_GSG: - case ETHTOOL_GSSET_INFO: - case ETHTOOL_GSTRINGS: - case ETHTOOL_GSTATS: - case ETHTOOL_GPHYSTATS: - case ETHTOOL_GTSO: - case ETHTOOL_GPERMADDR: - case ETHTOOL_GUFO: - case ETHTOOL_GGSO: - case ETHTOOL_GGRO: - case ETHTOOL_GFLAGS: - case ETHTOOL_GPFLAGS: - case ETHTOOL_GRXFH: - case ETHTOOL_GRXRINGS: - case ETHTOOL_GRXCLSRLCNT: - case ETHTOOL_GRXCLSRULE: - case ETHTOOL_GRXCLSRLALL: - case ETHTOOL_GRXFHINDIR: - case ETHTOOL_GRSSH: - case ETHTOOL_GFEATURES: - case ETHTOOL_GCHANNELS: - case ETHTOOL_GET_TS_INFO: - case ETHTOOL_GEEE: - case ETHTOOL_GTUNABLE: - case ETHTOOL_PHY_GTUNABLE: - case ETHTOOL_GLINKSETTINGS: - case ETHTOOL_GFECPARAM: - break; - default: - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - } - - if (dev->ethtool_ops->begin) { - rc = dev->ethtool_ops->begin(dev); - if (rc < 0) - return rc; - } - old_features = dev->features; - - switch (ethcmd) { - case ETHTOOL_GSET: - rc = ethtool_get_settings(dev, useraddr); - break; - case ETHTOOL_SSET: - rc = ethtool_set_settings(dev, useraddr); - break; - case ETHTOOL_GDRVINFO: - rc = ethtool_get_drvinfo(dev, useraddr); - break; - case ETHTOOL_GREGS: - rc = ethtool_get_regs(dev, useraddr); - break; - case ETHTOOL_GWOL: - rc = ethtool_get_wol(dev, useraddr); - break; - case ETHTOOL_SWOL: - rc = ethtool_set_wol(dev, useraddr); - break; - case ETHTOOL_GMSGLVL: - rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_msglevel); - break; - case ETHTOOL_SMSGLVL: - rc = ethtool_set_value_void(dev, useraddr, - dev->ethtool_ops->set_msglevel); - break; - case ETHTOOL_GEEE: - rc = ethtool_get_eee(dev, useraddr); - break; - case ETHTOOL_SEEE: - rc = ethtool_set_eee(dev, useraddr); - break; - case ETHTOOL_NWAY_RST: - rc = ethtool_nway_reset(dev); - break; - case ETHTOOL_GLINK: - rc = ethtool_get_link(dev, useraddr); - break; - case ETHTOOL_GEEPROM: - rc = ethtool_get_eeprom(dev, useraddr); - break; - case ETHTOOL_SEEPROM: - rc = ethtool_set_eeprom(dev, useraddr); - break; - case ETHTOOL_GCOALESCE: - rc = ethtool_get_coalesce(dev, useraddr); - break; - case ETHTOOL_SCOALESCE: - rc = ethtool_set_coalesce(dev, useraddr); - break; - case ETHTOOL_GRINGPARAM: - rc = ethtool_get_ringparam(dev, useraddr); - break; - case ETHTOOL_SRINGPARAM: - rc = ethtool_set_ringparam(dev, useraddr); - break; - case ETHTOOL_GPAUSEPARAM: - rc = ethtool_get_pauseparam(dev, useraddr); - break; - case ETHTOOL_SPAUSEPARAM: - rc = ethtool_set_pauseparam(dev, useraddr); - break; - case ETHTOOL_TEST: - rc = ethtool_self_test(dev, useraddr); - break; - case ETHTOOL_GSTRINGS: - rc = ethtool_get_strings(dev, useraddr); - break; - case ETHTOOL_PHYS_ID: - rc = ethtool_phys_id(dev, useraddr); - break; - case ETHTOOL_GSTATS: - rc = ethtool_get_stats(dev, useraddr); - break; - case ETHTOOL_GPERMADDR: - rc = ethtool_get_perm_addr(dev, useraddr); - break; - case ETHTOOL_GFLAGS: - rc = ethtool_get_value(dev, useraddr, ethcmd, - __ethtool_get_flags); - break; - case ETHTOOL_SFLAGS: - rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); - break; - case ETHTOOL_GPFLAGS: - rc = ethtool_get_value(dev, useraddr, ethcmd, - dev->ethtool_ops->get_priv_flags); - break; - case ETHTOOL_SPFLAGS: - rc = ethtool_set_value(dev, useraddr, - dev->ethtool_ops->set_priv_flags); - break; - case ETHTOOL_GRXFH: - case ETHTOOL_GRXRINGS: - case ETHTOOL_GRXCLSRLCNT: - case ETHTOOL_GRXCLSRULE: - case ETHTOOL_GRXCLSRLALL: - rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); - break; - case ETHTOOL_SRXFH: - case ETHTOOL_SRXCLSRLDEL: - case ETHTOOL_SRXCLSRLINS: - rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); - break; - case ETHTOOL_FLASHDEV: - rc = ethtool_flash_device(dev, useraddr); - break; - case ETHTOOL_RESET: - rc = ethtool_reset(dev, useraddr); - break; - case ETHTOOL_GSSET_INFO: - rc = ethtool_get_sset_info(dev, useraddr); - break; - case ETHTOOL_GRXFHINDIR: - rc = ethtool_get_rxfh_indir(dev, useraddr); - break; - case ETHTOOL_SRXFHINDIR: - rc = ethtool_set_rxfh_indir(dev, useraddr); - break; - case ETHTOOL_GRSSH: - rc = ethtool_get_rxfh(dev, useraddr); - break; - case ETHTOOL_SRSSH: - rc = ethtool_set_rxfh(dev, useraddr); - break; - case ETHTOOL_GFEATURES: - rc = ethtool_get_features(dev, useraddr); - break; - case ETHTOOL_SFEATURES: - rc = ethtool_set_features(dev, useraddr); - break; - case ETHTOOL_GTXCSUM: - case ETHTOOL_GRXCSUM: - case ETHTOOL_GSG: - case ETHTOOL_GTSO: - case ETHTOOL_GGSO: - case ETHTOOL_GGRO: - rc = ethtool_get_one_feature(dev, useraddr, ethcmd); - break; - case ETHTOOL_STXCSUM: - case ETHTOOL_SRXCSUM: - case ETHTOOL_SSG: - case ETHTOOL_STSO: - case ETHTOOL_SGSO: - case ETHTOOL_SGRO: - rc = ethtool_set_one_feature(dev, useraddr, ethcmd); - break; - case ETHTOOL_GCHANNELS: - rc = ethtool_get_channels(dev, useraddr); - break; - case ETHTOOL_SCHANNELS: - rc = ethtool_set_channels(dev, useraddr); - break; - case ETHTOOL_SET_DUMP: - rc = ethtool_set_dump(dev, useraddr); - break; - case ETHTOOL_GET_DUMP_FLAG: - rc = ethtool_get_dump_flag(dev, useraddr); - break; - case ETHTOOL_GET_DUMP_DATA: - rc = ethtool_get_dump_data(dev, useraddr); - break; - case ETHTOOL_GET_TS_INFO: - rc = ethtool_get_ts_info(dev, useraddr); - break; - case ETHTOOL_GMODULEINFO: - rc = ethtool_get_module_info(dev, useraddr); - break; - case ETHTOOL_GMODULEEEPROM: - rc = ethtool_get_module_eeprom(dev, useraddr); - break; - case ETHTOOL_GTUNABLE: - rc = ethtool_get_tunable(dev, useraddr); - break; - case ETHTOOL_STUNABLE: - rc = ethtool_set_tunable(dev, useraddr); - break; - case ETHTOOL_GPHYSTATS: - rc = ethtool_get_phy_stats(dev, useraddr); - break; - case ETHTOOL_PERQUEUE: - rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); - break; - case ETHTOOL_GLINKSETTINGS: - rc = ethtool_get_link_ksettings(dev, useraddr); - break; - case ETHTOOL_SLINKSETTINGS: - rc = ethtool_set_link_ksettings(dev, useraddr); - break; - case ETHTOOL_PHY_GTUNABLE: - rc = get_phy_tunable(dev, useraddr); - break; - case ETHTOOL_PHY_STUNABLE: - rc = set_phy_tunable(dev, useraddr); - break; - case ETHTOOL_GFECPARAM: - rc = ethtool_get_fecparam(dev, useraddr); - break; - case ETHTOOL_SFECPARAM: - rc = ethtool_set_fecparam(dev, useraddr); - break; - default: - rc = -EOPNOTSUPP; - } - - if (dev->ethtool_ops->complete) - dev->ethtool_ops->complete(dev); - - if (old_features != dev->features) - netdev_features_change(dev); - - return rc; -} - -struct ethtool_rx_flow_key { - struct flow_dissector_key_basic basic; - union { - struct flow_dissector_key_ipv4_addrs ipv4; - struct flow_dissector_key_ipv6_addrs ipv6; - }; - struct flow_dissector_key_ports tp; - struct flow_dissector_key_ip ip; - struct flow_dissector_key_vlan vlan; - struct flow_dissector_key_eth_addrs eth_addrs; -} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ - -struct ethtool_rx_flow_match { - struct flow_dissector dissector; - struct ethtool_rx_flow_key key; - struct ethtool_rx_flow_key mask; -}; - -struct ethtool_rx_flow_rule * -ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) -{ - const struct ethtool_rx_flow_spec *fs = input->fs; - static struct in6_addr zero_addr = {}; - struct ethtool_rx_flow_match *match; - struct ethtool_rx_flow_rule *flow; - struct flow_action_entry *act; - - flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) + - sizeof(struct ethtool_rx_flow_match), GFP_KERNEL); - if (!flow) - return ERR_PTR(-ENOMEM); - - /* ethtool_rx supports only one single action per rule. */ - flow->rule = flow_rule_alloc(1); - if (!flow->rule) { - kfree(flow); - return ERR_PTR(-ENOMEM); - } - - match = (struct ethtool_rx_flow_match *)flow->priv; - flow->rule->match.dissector = &match->dissector; - flow->rule->match.mask = &match->mask; - flow->rule->match.key = &match->key; - - match->mask.basic.n_proto = htons(0xffff); - - switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { - case ETHER_FLOW: { - const struct ethhdr *ether_spec, *ether_m_spec; - - ether_spec = &fs->h_u.ether_spec; - ether_m_spec = &fs->m_u.ether_spec; - - if (!is_zero_ether_addr(ether_m_spec->h_source)) { - ether_addr_copy(match->key.eth_addrs.src, - ether_spec->h_source); - ether_addr_copy(match->mask.eth_addrs.src, - ether_m_spec->h_source); - } - if (!is_zero_ether_addr(ether_m_spec->h_dest)) { - ether_addr_copy(match->key.eth_addrs.dst, - ether_spec->h_dest); - ether_addr_copy(match->mask.eth_addrs.dst, - ether_m_spec->h_dest); - } - if (ether_m_spec->h_proto) { - match->key.basic.n_proto = ether_spec->h_proto; - match->mask.basic.n_proto = ether_m_spec->h_proto; - } - } - break; - case TCP_V4_FLOW: - case UDP_V4_FLOW: { - const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; - - match->key.basic.n_proto = htons(ETH_P_IP); - - v4_spec = &fs->h_u.tcp_ip4_spec; - v4_m_spec = &fs->m_u.tcp_ip4_spec; - - if (v4_m_spec->ip4src) { - match->key.ipv4.src = v4_spec->ip4src; - match->mask.ipv4.src = v4_m_spec->ip4src; - } - if (v4_m_spec->ip4dst) { - match->key.ipv4.dst = v4_spec->ip4dst; - match->mask.ipv4.dst = v4_m_spec->ip4dst; - } - if (v4_m_spec->ip4src || - v4_m_spec->ip4dst) { - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); - match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = - offsetof(struct ethtool_rx_flow_key, ipv4); - } - if (v4_m_spec->psrc) { - match->key.tp.src = v4_spec->psrc; - match->mask.tp.src = v4_m_spec->psrc; - } - if (v4_m_spec->pdst) { - match->key.tp.dst = v4_spec->pdst; - match->mask.tp.dst = v4_m_spec->pdst; - } - if (v4_m_spec->psrc || - v4_m_spec->pdst) { - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); - match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = - offsetof(struct ethtool_rx_flow_key, tp); - } - if (v4_m_spec->tos) { - match->key.ip.tos = v4_spec->tos; - match->mask.ip.tos = v4_m_spec->tos; - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IP); - match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = - offsetof(struct ethtool_rx_flow_key, ip); - } - } - break; - case TCP_V6_FLOW: - case UDP_V6_FLOW: { - const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec; - - match->key.basic.n_proto = htons(ETH_P_IPV6); - - v6_spec = &fs->h_u.tcp_ip6_spec; - v6_m_spec = &fs->m_u.tcp_ip6_spec; - if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { - memcpy(&match->key.ipv6.src, v6_spec->ip6src, - sizeof(match->key.ipv6.src)); - memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src, - sizeof(match->mask.ipv6.src)); - } - if (memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) { - memcpy(&match->key.ipv6.dst, v6_spec->ip6dst, - sizeof(match->key.ipv6.dst)); - memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst, - sizeof(match->mask.ipv6.dst)); - } - if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr)) || - memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); - match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = - offsetof(struct ethtool_rx_flow_key, ipv6); - } - if (v6_m_spec->psrc) { - match->key.tp.src = v6_spec->psrc; - match->mask.tp.src = v6_m_spec->psrc; - } - if (v6_m_spec->pdst) { - match->key.tp.dst = v6_spec->pdst; - match->mask.tp.dst = v6_m_spec->pdst; - } - if (v6_m_spec->psrc || - v6_m_spec->pdst) { - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); - match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = - offsetof(struct ethtool_rx_flow_key, tp); - } - if (v6_m_spec->tclass) { - match->key.ip.tos = v6_spec->tclass; - match->mask.ip.tos = v6_m_spec->tclass; - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IP); - match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = - offsetof(struct ethtool_rx_flow_key, ip); - } - } - break; - default: - ethtool_rx_flow_rule_destroy(flow); - return ERR_PTR(-EINVAL); - } - - switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { - case TCP_V4_FLOW: - case TCP_V6_FLOW: - match->key.basic.ip_proto = IPPROTO_TCP; - break; - case UDP_V4_FLOW: - case UDP_V6_FLOW: - match->key.basic.ip_proto = IPPROTO_UDP; - break; - } - match->mask.basic.ip_proto = 0xff; - - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); - match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = - offsetof(struct ethtool_rx_flow_key, basic); - - if (fs->flow_type & FLOW_EXT) { - const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; - const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; - - if (ext_m_spec->vlan_etype) { - match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; - match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; - } - - if (ext_m_spec->vlan_tci) { - match->key.vlan.vlan_id = - ntohs(ext_h_spec->vlan_tci) & 0x0fff; - match->mask.vlan.vlan_id = - ntohs(ext_m_spec->vlan_tci) & 0x0fff; - - match->key.vlan.vlan_dei = - !!(ext_h_spec->vlan_tci & htons(0x1000)); - match->mask.vlan.vlan_dei = - !!(ext_m_spec->vlan_tci & htons(0x1000)); - - match->key.vlan.vlan_priority = - (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; - match->mask.vlan.vlan_priority = - (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; - } - - if (ext_m_spec->vlan_etype || - ext_m_spec->vlan_tci) { - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_VLAN); - match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = - offsetof(struct ethtool_rx_flow_key, vlan); - } - } - if (fs->flow_type & FLOW_MAC_EXT) { - const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; - const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; - - memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest, - ETH_ALEN); - memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest, - ETH_ALEN); - - match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); - match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = - offsetof(struct ethtool_rx_flow_key, eth_addrs); - } - - act = &flow->rule->action.entries[0]; - switch (fs->ring_cookie) { - case RX_CLS_FLOW_DISC: - act->id = FLOW_ACTION_DROP; - break; - case RX_CLS_FLOW_WAKE: - act->id = FLOW_ACTION_WAKE; - break; - default: - act->id = FLOW_ACTION_QUEUE; - if (fs->flow_type & FLOW_RSS) - act->queue.ctx = input->rss_ctx; - - act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); - act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie); - break; - } - - return flow; -} -EXPORT_SYMBOL(ethtool_rx_flow_rule_create); - -void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow) -{ - kfree(flow->rule); - kfree(flow); -} -EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile new file mode 100644 index 000000000000..7e5c9eb85c90 --- /dev/null +++ b/net/ethtool/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-y += ioctl.o diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c new file mode 100644 index 000000000000..cd9bc67381b2 --- /dev/null +++ b/net/ethtool/ioctl.c @@ -0,0 +1,3116 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/ethtool.c - Ethtool ioctl handler + * Copyright (c) 2003 Matthew Wilcox + * + * This file is where we call all the ethtool_ops commands to get + * the information ethtool needs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Some useful ethtool_ops methods that're device independent. + * If we find that all drivers want to do the same thing here, + * we can turn these into dev_() function calls. + */ + +u32 ethtool_op_get_link(struct net_device *dev) +{ + return netif_carrier_ok(dev) ? 1 : 0; +} +EXPORT_SYMBOL(ethtool_op_get_link); + +int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) +{ + info->so_timestamping = + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + info->phc_index = -1; + return 0; +} +EXPORT_SYMBOL(ethtool_op_get_ts_info); + +/* Handlers for each ethtool command */ + +#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) + +static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { + [NETIF_F_SG_BIT] = "tx-scatter-gather", + [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", + [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", + [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", + [NETIF_F_HIGHDMA_BIT] = "highdma", + [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", + [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", + + [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", + [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", + [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", + [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", + [NETIF_F_GSO_BIT] = "tx-generic-segmentation", + [NETIF_F_LLTX_BIT] = "tx-lockless", + [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", + [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", + [NETIF_F_LRO_BIT] = "rx-lro", + + [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", + [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", + [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", + [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", + [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", + [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", + [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", + [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", + [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", + [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", + + [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", + [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", + [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", + [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", + [NETIF_F_RXHASH_BIT] = "rx-hashing", + [NETIF_F_RXCSUM_BIT] = "rx-checksum", + [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", + [NETIF_F_LOOPBACK_BIT] = "loopback", + [NETIF_F_RXFCS_BIT] = "rx-fcs", + [NETIF_F_RXALL_BIT] = "rx-all", + [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", + [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", + [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", + [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", + [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", +}; + +static const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", + [ETH_RSS_HASH_CRC32_BIT] = "crc32", +}; + +static const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", + [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", +}; + +static const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", + [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", + [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", +}; + +static int ethtool_get_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gfeatures cmd = { + .cmd = ETHTOOL_GFEATURES, + .size = ETHTOOL_DEV_FEATURE_WORDS, + }; + struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; + u32 __user *sizeaddr; + u32 copy_size; + int i; + + /* in case feature bits run out again */ + BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); + + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + features[i].available = (u32)(dev->hw_features >> (32 * i)); + features[i].requested = (u32)(dev->wanted_features >> (32 * i)); + features[i].active = (u32)(dev->features >> (32 * i)); + features[i].never_changed = + (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); + } + + sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); + if (get_user(copy_size, sizeaddr)) + return -EFAULT; + + if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) + copy_size = ETHTOOL_DEV_FEATURE_WORDS; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_sfeatures cmd; + struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; + netdev_features_t wanted = 0, valid = 0; + int i, ret = 0; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + + if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) + return -EINVAL; + + if (copy_from_user(features, useraddr, sizeof(features))) + return -EFAULT; + + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + valid |= (netdev_features_t)features[i].valid << (32 * i); + wanted |= (netdev_features_t)features[i].requested << (32 * i); + } + + if (valid & ~NETIF_F_ETHTOOL_BITS) + return -EINVAL; + + if (valid & ~dev->hw_features) { + valid &= dev->hw_features; + ret |= ETHTOOL_F_UNSUPPORTED; + } + + dev->wanted_features &= ~valid; + dev->wanted_features |= wanted & valid; + __netdev_update_features(dev); + + if ((dev->wanted_features ^ dev->features) & valid) + ret |= ETHTOOL_F_WISH; + + return ret; +} + +static int __ethtool_get_sset_count(struct net_device *dev, int sset) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (sset == ETH_SS_FEATURES) + return ARRAY_SIZE(netdev_features_strings); + + if (sset == ETH_SS_RSS_HASH_FUNCS) + return ARRAY_SIZE(rss_hash_func_strings); + + if (sset == ETH_SS_TUNABLES) + return ARRAY_SIZE(tunable_strings); + + if (sset == ETH_SS_PHY_TUNABLES) + return ARRAY_SIZE(phy_tunable_strings); + + if (sset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + return phy_ethtool_get_sset_count(dev->phydev); + + if (ops->get_sset_count && ops->get_strings) + return ops->get_sset_count(dev, sset); + else + return -EOPNOTSUPP; +} + +static void __ethtool_get_strings(struct net_device *dev, + u32 stringset, u8 *data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (stringset == ETH_SS_FEATURES) + memcpy(data, netdev_features_strings, + sizeof(netdev_features_strings)); + else if (stringset == ETH_SS_RSS_HASH_FUNCS) + memcpy(data, rss_hash_func_strings, + sizeof(rss_hash_func_strings)); + else if (stringset == ETH_SS_TUNABLES) + memcpy(data, tunable_strings, sizeof(tunable_strings)); + else if (stringset == ETH_SS_PHY_TUNABLES) + memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); + else if (stringset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + phy_ethtool_get_strings(dev->phydev, data); + else + /* ops->get_strings is valid because checked earlier */ + ops->get_strings(dev, stringset, data); +} + +static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) +{ + /* feature masks of legacy discrete ethtool ops */ + + switch (eth_cmd) { + case ETHTOOL_GTXCSUM: + case ETHTOOL_STXCSUM: + return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC; + case ETHTOOL_GRXCSUM: + case ETHTOOL_SRXCSUM: + return NETIF_F_RXCSUM; + case ETHTOOL_GSG: + case ETHTOOL_SSG: + return NETIF_F_SG; + case ETHTOOL_GTSO: + case ETHTOOL_STSO: + return NETIF_F_ALL_TSO; + case ETHTOOL_GGSO: + case ETHTOOL_SGSO: + return NETIF_F_GSO; + case ETHTOOL_GGRO: + case ETHTOOL_SGRO: + return NETIF_F_GRO; + default: + BUG(); + } +} + +static int ethtool_get_one_feature(struct net_device *dev, + char __user *useraddr, u32 ethcmd) +{ + netdev_features_t mask = ethtool_get_feature_mask(ethcmd); + struct ethtool_value edata = { + .cmd = ethcmd, + .data = !!(dev->features & mask), + }; + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_one_feature(struct net_device *dev, + void __user *useraddr, u32 ethcmd) +{ + struct ethtool_value edata; + netdev_features_t mask; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + mask = ethtool_get_feature_mask(ethcmd); + mask &= dev->hw_features; + if (!mask) + return -EOPNOTSUPP; + + if (edata.data) + dev->wanted_features |= mask; + else + dev->wanted_features &= ~mask; + + __netdev_update_features(dev); + + return 0; +} + +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ + NETIF_F_RXHASH) + +static u32 __ethtool_get_flags(struct net_device *dev) +{ + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) + flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) + flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) + flags |= ETH_FLAG_RXHASH; + + return flags; +} + +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + netdev_features_t features = 0, changed; + + if (data & ~ETH_ALL_FLAGS) + return -EINVAL; + + if (data & ETH_FLAG_LRO) + features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_TX; + if (data & ETH_FLAG_NTUPLE) + features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; + + /* allow changing only bits set in hw_features */ + changed = (features ^ dev->features) & ETH_ALL_FEATURES; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (features & changed); + + __netdev_update_features(dev); + + return 0; +} + +/* Given two link masks, AND them together and save the result in dst. */ +void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, + struct ethtool_link_ksettings *src) +{ + unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int idx = 0; + + for (; idx < size; idx++) { + dst->link_modes.supported[idx] &= + src->link_modes.supported[idx]; + dst->link_modes.advertising[idx] &= + src->link_modes.advertising[idx]; + } +} +EXPORT_SYMBOL(ethtool_intersect_link_masks); + +void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, + u32 legacy_u32) +{ + bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); + dst[0] = legacy_u32; +} +EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); + +/* return false if src had higher bits set. lower bits always updated. */ +bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, + const unsigned long *src) +{ + bool retval = true; + + /* TODO: following test will soon always be true */ + if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); + + bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_fill(ext, 32); + bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + if (bitmap_intersects(ext, src, + __ETHTOOL_LINK_MODE_MASK_NBITS)) { + /* src mask goes beyond bit 31 */ + retval = false; + } + } + *legacy_u32 = src[0]; + return retval; +} +EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); + +/* return false if legacy contained non-0 deprecated fields + * maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +static bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} + +/* return false if ksettings link modes had higher bits + * set. legacy_settings always updated (best effort) + */ +static bool +convert_link_ksettings_to_legacy_settings( + struct ethtool_cmd *legacy_settings, + const struct ethtool_link_ksettings *link_ksettings) +{ + bool retval = true; + + memset(legacy_settings, 0, sizeof(*legacy_settings)); + /* this also clears the deprecated fields in legacy structure: + * __u8 transceiver; + * __u32 maxtxpkt; + * __u32 maxrxpkt; + */ + + retval &= ethtool_convert_link_mode_to_legacy_u32( + &legacy_settings->supported, + link_ksettings->link_modes.supported); + retval &= ethtool_convert_link_mode_to_legacy_u32( + &legacy_settings->advertising, + link_ksettings->link_modes.advertising); + retval &= ethtool_convert_link_mode_to_legacy_u32( + &legacy_settings->lp_advertising, + link_ksettings->link_modes.lp_advertising); + ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); + legacy_settings->duplex + = link_ksettings->base.duplex; + legacy_settings->port + = link_ksettings->base.port; + legacy_settings->phy_address + = link_ksettings->base.phy_address; + legacy_settings->autoneg + = link_ksettings->base.autoneg; + legacy_settings->mdio_support + = link_ksettings->base.mdio_support; + legacy_settings->eth_tp_mdix + = link_ksettings->base.eth_tp_mdix; + legacy_settings->eth_tp_mdix_ctrl + = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; + return retval; +} + +/* number of 32-bit words to store the user's link mode bitmaps */ +#define __ETHTOOL_LINK_MODE_MASK_NU32 \ + DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) + +/* layout of the struct passed from/to userland */ +struct ethtool_link_usettings { + struct ethtool_link_settings base; + struct { + __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + } link_modes; +}; + +/* Internal kernel helper to query a device ethtool_link_settings. */ +int __ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + ASSERT_RTNL(); + + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); +} +EXPORT_SYMBOL(__ethtool_get_link_ksettings); + +/* convert ethtool_link_usettings in user space to a kernel internal + * ethtool_link_ksettings. return 0 on success, errno on error. + */ +static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, + const void __user *from) +{ + struct ethtool_link_usettings link_usettings; + + if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) + return -EFAULT; + + memcpy(&to->base, &link_usettings.base, sizeof(to->base)); + bitmap_from_arr32(to->link_modes.supported, + link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_from_arr32(to->link_modes.advertising, + link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_from_arr32(to->link_modes.lp_advertising, + link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + return 0; +} + +/* convert a kernel internal ethtool_link_ksettings to + * ethtool_link_usettings in user space. return 0 on success, errno on + * error. + */ +static int +store_link_ksettings_for_user(void __user *to, + const struct ethtool_link_ksettings *from) +{ + struct ethtool_link_usettings link_usettings; + + memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); + bitmap_to_arr32(link_usettings.link_modes.supported, + from->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_arr32(link_usettings.link_modes.advertising, + from->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_arr32(link_usettings.link_modes.lp_advertising, + from->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) + return -EFAULT; + + return 0; +} + +/* Query device for its ethtool_link_settings. */ +static int ethtool_get_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err = 0; + struct ethtool_link_ksettings link_ksettings; + + ASSERT_RTNL(); + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; + + /* handle bitmap nbits handshake */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) { + /* wrong link mode nbits requested */ + memset(&link_ksettings, 0, sizeof(link_ksettings)); + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + /* send back number of words required as negative val */ + compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, + "need too many bits for link modes!"); + link_ksettings.base.link_mode_masks_nwords + = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); + + /* copy the base fields back to user, not the link + * mode bitmaps + */ + if (copy_to_user(useraddr, &link_ksettings.base, + sizeof(link_ksettings.base))) + return -EFAULT; + + return 0; + } + + /* handshake successful: user/kernel agree on + * link_mode_masks_nwords + */ + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); + if (err < 0) + return err; + + /* make sure we tell the right values to user */ + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + + return store_link_ksettings_for_user(useraddr, &link_ksettings); +} + +/* Update device ethtool_link_settings. */ +static int ethtool_set_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err; + struct ethtool_link_ksettings link_ksettings; + + ASSERT_RTNL(); + + if (!dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + /* make sure nbits field has expected value */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + /* copy the whole structure, now that we know it has expected + * format + */ + err = load_link_ksettings_from_user(&link_ksettings, useraddr); + if (err) + return err; + + /* re-check nwords field, just in case */ + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); +} + +/* Query device for its ethtool_cmd settings. + * + * Backward compatibility note: for compatibility with legacy ethtool, this is + * now implemented via get_link_ksettings. When driver reports higher link mode + * bits, a kernel warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, but the command is successful (only the + * lower link mode bits reported back to user). Deprecated fields from + * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero. + */ +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_link_ksettings link_ksettings; + struct ethtool_cmd cmd; + int err; + + ASSERT_RTNL(); + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); + if (err < 0) + return err; + convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); + + /* send a sensible cmd tag back to user */ + cmd.cmd = ETHTOOL_GSET; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + + return 0; +} + +/* Update device link settings with given ethtool_cmd. + * + * Backward compatibility note: for compatibility with legacy ethtool, this is + * now always implemented via set_link_settings. When user's request updates + * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel + * warning is logged once (with name of 1st driver/device) to recommend user to + * upgrade ethtool, and the request is rejected. + */ +static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_link_ksettings link_ksettings; + struct ethtool_cmd cmd; + + ASSERT_RTNL(); + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + if (!dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) + return -EINVAL; + link_ksettings.base.link_mode_masks_nwords = + __ETHTOOL_LINK_MODE_MASK_NU32; + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); +} + +static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_drvinfo info; + const struct ethtool_ops *ops = dev->ethtool_ops; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GDRVINFO; + if (ops->get_drvinfo) { + ops->get_drvinfo(dev, &info); + } else if (dev->dev.parent && dev->dev.parent->driver) { + strlcpy(info.bus_info, dev_name(dev->dev.parent), + sizeof(info.bus_info)); + strlcpy(info.driver, dev->dev.parent->driver->name, + sizeof(info.driver)); + } else { + return -EOPNOTSUPP; + } + + /* + * this method of obtaining string set info is deprecated; + * Use ETHTOOL_GSSET_INFO instead. + */ + if (ops->get_sset_count) { + int rc; + + rc = ops->get_sset_count(dev, ETH_SS_TEST); + if (rc >= 0) + info.testinfo_len = rc; + rc = ops->get_sset_count(dev, ETH_SS_STATS); + if (rc >= 0) + info.n_stats = rc; + rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); + if (rc >= 0) + info.n_priv_flags = rc; + } + if (ops->get_regs_len) { + int ret = ops->get_regs_len(dev); + + if (ret > 0) + info.regdump_len = ret; + } + + if (ops->get_eeprom_len) + info.eedump_len = ops->get_eeprom_len(dev); + + if (!info.fw_version[0]) + devlink_compat_running_version(dev, info.fw_version, + sizeof(info.fw_version)); + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_sset_info info; + u64 sset_mask; + int i, idx = 0, n_bits = 0, ret, rc; + u32 *info_buf = NULL; + + if (copy_from_user(&info, useraddr, sizeof(info))) + return -EFAULT; + + /* store copy of mask, because we zero struct later on */ + sset_mask = info.sset_mask; + if (!sset_mask) + return 0; + + /* calculate size of return buffer */ + n_bits = hweight64(sset_mask); + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GSSET_INFO; + + info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER); + if (!info_buf) + return -ENOMEM; + + /* + * fill return buffer based on input bitmask and successful + * get_sset_count return + */ + for (i = 0; i < 64; i++) { + if (!(sset_mask & (1ULL << i))) + continue; + + rc = __ethtool_get_sset_count(dev, i); + if (rc >= 0) { + info.sset_mask |= (1ULL << i); + info_buf[idx++] = rc; + } + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, sizeof(info))) + goto out; + + useraddr += offsetof(struct ethtool_sset_info, data); + if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) + goto out; + + ret = 0; + +out: + kfree(info_buf); + return ret; +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + int rc; + + if (!dev->ethtool_ops->set_rxnfc) + return -EOPNOTSUPP; + + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + + rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (rc) + return rc; + + if (cmd == ETHTOOL_SRXCLSRLINS && + copy_to_user(useraddr, &info, info_size)) + return -EFAULT; + + return 0; +} + +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. + */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. + */ + if (!(info.flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info.cmd != cmd) + return -EINVAL; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kcalloc(info.rule_cnt, sizeof(u32), + GFP_USER); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, info_size)) + goto err_out; + + if (rule_buf) { + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + if (copy_to_user(useraddr, rule_buf, + info.rule_cnt * sizeof(u32))) + goto err_out; + } + ret = 0; + +err_out: + kfree(rule_buf); + + return ret; +} + +static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, + struct ethtool_rxnfc *rx_rings, + u32 size) +{ + int i; + + if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) + return -EFAULT; + + /* Validate ring indices */ + for (i = 0; i < size; i++) + if (indir[i] >= rx_rings->data) + return -EINVAL; + + return 0; +} + +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; + +void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +EXPORT_SYMBOL(netdev_rss_key_fill); + +static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + u32 user_size, dev_size; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&user_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(user_size))) + return -EFAULT; + + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), + &dev_size, sizeof(dev_size))) + return -EFAULT; + + /* If the user buffer size is 0, this is just a query for the + * device table size. Otherwise, if it's smaller than the + * device table size it's an error. + */ + if (user_size < dev_size) + return user_size == 0 ? 0 : -EINVAL; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh_indir, ring_index[0]), + indir, dev_size * sizeof(indir[0]))) + ret = -EFAULT; + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rxnfc rx_rings; + u32 user_size, dev_size, i; + u32 *indir; + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); + + if (!ops->get_rxfh_indir_size || !ops->set_rxfh || + !ops->get_rxnfc) + return -EOPNOTSUPP; + + dev_size = ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&user_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(user_size))) + return -EFAULT; + + if (user_size != 0 && user_size != dev_size) + return -EINVAL; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret) + goto out; + + if (user_size == 0) { + for (i = 0; i < dev_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } else { + ret = ethtool_copy_validate_indir(indir, + useraddr + ringidx_offset, + &rx_rings, + dev_size); + if (ret) + goto out; + } + + ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (user_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else + dev->priv_flags |= IFF_RXFH_CONFIGURED; + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + u32 user_indir_size, user_key_size; + u32 dev_indir_size = 0, dev_key_size = 0; + struct ethtool_rxfh rxfh; + u32 total_size; + u32 indir_bytes; + u32 *indir = NULL; + u8 dev_hfunc = 0; + u8 *hkey = NULL; + u8 *rss_config; + + if (!ops->get_rxfh) + return -EOPNOTSUPP; + + if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + dev_key_size = ops->get_rxfh_key_size(dev); + + if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) + return -EFAULT; + user_indir_size = rxfh.indir_size; + user_key_size = rxfh.key_size; + + /* Check that reserved fields are 0 for now */ + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) + return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->get_rxfh_context) + return -EOPNOTSUPP; + + rxfh.indir_size = dev_indir_size; + rxfh.key_size = dev_key_size; + if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) + return -EFAULT; + + if ((user_indir_size && (user_indir_size != dev_indir_size)) || + (user_key_size && (user_key_size != dev_key_size))) + return -EINVAL; + + indir_bytes = user_indir_size * sizeof(indir[0]); + total_size = indir_bytes + user_key_size; + rss_config = kzalloc(total_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + if (user_indir_size) + indir = (u32 *)rss_config; + + if (user_key_size) + hkey = rss_config + indir_bytes; + + if (rxfh.rss_context) + ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey, + &dev_hfunc, + rxfh.rss_context); + else + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (ret) + goto out; + + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), + &dev_hfunc, sizeof(rxfh.hfunc))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) { + ret = -EFAULT; + } +out: + kfree(rss_config); + + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, + void __user *useraddr) +{ + int ret; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings; + struct ethtool_rxfh rxfh; + u32 dev_indir_size = 0, dev_key_size = 0, i; + u32 *indir = NULL, indir_bytes = 0; + u8 *hkey = NULL; + u8 *rss_config; + u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + bool delete = false; + + if (!ops->get_rxnfc || !ops->set_rxfh) + return -EOPNOTSUPP; + + if (ops->get_rxfh_indir_size) + dev_indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + dev_key_size = ops->get_rxfh_key_size(dev); + + if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) + return -EFAULT; + + /* Check that reserved fields are 0 for now */ + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) + return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->set_rxfh_context) + return -EOPNOTSUPP; + + /* If either indir, hash key or function is valid, proceed further. + * Must request at least one change: indir size, hash key or function. + */ + if ((rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && + rxfh.indir_size != dev_indir_size) || + (rxfh.key_size && (rxfh.key_size != dev_key_size)) || + (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && + rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) + return -EINVAL; + + if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + indir_bytes = dev_indir_size * sizeof(indir[0]); + + rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); + if (!rss_config) + return -ENOMEM; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret) + goto out; + + /* rxfh.indir_size == 0 means reset the indir table to default (master + * context) or delete the context (other RSS contexts). + * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. + */ + if (rxfh.indir_size && + rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { + indir = (u32 *)rss_config; + ret = ethtool_copy_validate_indir(indir, + useraddr + rss_cfg_offset, + &rx_rings, + rxfh.indir_size); + if (ret) + goto out; + } else if (rxfh.indir_size == 0) { + if (rxfh.rss_context == 0) { + indir = (u32 *)rss_config; + for (i = 0; i < dev_indir_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } else { + delete = true; + } + } + + if (rxfh.key_size) { + hkey = rss_config + indir_bytes; + if (copy_from_user(hkey, + useraddr + rss_cfg_offset + indir_bytes, + rxfh.key_size)) { + ret = -EFAULT; + goto out; + } + } + + if (rxfh.rss_context) + ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc, + &rxfh.rss_context, delete); + else + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (ret) + goto out; + + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), + &rxfh.rss_context, sizeof(rxfh.rss_context))) + ret = -EFAULT; + + if (!rxfh.rss_context) { + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; + } + +out: + kfree(rss_config); + return ret; +} + +static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_regs regs; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *regbuf; + int reglen, ret; + + if (!ops->get_regs || !ops->get_regs_len) + return -EOPNOTSUPP; + + if (copy_from_user(®s, useraddr, sizeof(regs))) + return -EFAULT; + + reglen = ops->get_regs_len(dev); + if (reglen <= 0) + return reglen; + + if (regs.len > reglen) + regs.len = reglen; + + regbuf = vzalloc(reglen); + if (!regbuf) + return -ENOMEM; + + if (regs.len < reglen) + reglen = regs.len; + + ops->get_regs(dev, ®s, regbuf); + + ret = -EFAULT; + if (copy_to_user(useraddr, ®s, sizeof(regs))) + goto out; + useraddr += offsetof(struct ethtool_regs, data); + if (copy_to_user(useraddr, regbuf, reglen)) + goto out; + ret = 0; + + out: + vfree(regbuf); + return ret; +} + +static int ethtool_reset(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value reset; + int ret; + + if (!dev->ethtool_ops->reset) + return -EOPNOTSUPP; + + if (copy_from_user(&reset, useraddr, sizeof(reset))) + return -EFAULT; + + ret = dev->ethtool_ops->reset(dev, &reset.data); + if (ret) + return ret; + + if (copy_to_user(useraddr, &reset, sizeof(reset))) + return -EFAULT; + return 0; +} + +static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + memset(&wol, 0, sizeof(struct ethtool_wolinfo)); + wol.cmd = ETHTOOL_GWOL; + dev->ethtool_ops->get_wol(dev, &wol); + + if (copy_to_user(useraddr, &wol, sizeof(wol))) + return -EFAULT; + return 0; +} + +static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol; + int ret; + + if (!dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + if (copy_from_user(&wol, useraddr, sizeof(wol))) + return -EFAULT; + + ret = dev->ethtool_ops->set_wol(dev, &wol); + if (ret) + return ret; + + dev->wol_enabled = !!wol.wolopts; + + return 0; +} + +static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + int rc; + + if (!dev->ethtool_ops->get_eee) + return -EOPNOTSUPP; + + memset(&edata, 0, sizeof(struct ethtool_eee)); + edata.cmd = ETHTOOL_GEEE; + rc = dev->ethtool_ops->get_eee(dev, &edata); + + if (rc) + return rc; + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + + if (!dev->ethtool_ops->set_eee) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return dev->ethtool_ops->set_eee(dev, &edata); +} + +static int ethtool_nway_reset(struct net_device *dev) +{ + if (!dev->ethtool_ops->nway_reset) + return -EOPNOTSUPP; + + return dev->ethtool_ops->nway_reset(dev); +} + +static int ethtool_get_link(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, + int (*getter)(struct net_device *, + struct ethtool_eeprom *, u8 *), + u32 total_len) +{ + struct ethtool_eeprom eeprom; + void __user *userbuf = useraddr + sizeof(eeprom); + u32 bytes_remaining; + u8 *data; + int ret = 0; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > total_len) + return -EINVAL; + + data = kmalloc(PAGE_SIZE, GFP_USER); + if (!data) + return -ENOMEM; + + bytes_remaining = eeprom.len; + while (bytes_remaining > 0) { + eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); + + ret = getter(dev, &eeprom, data); + if (ret) + break; + if (copy_to_user(userbuf, data, eeprom.len)) { + ret = -EFAULT; + break; + } + userbuf += eeprom.len; + eeprom.offset += eeprom.len; + bytes_remaining -= eeprom.len; + } + + eeprom.len = userbuf - (useraddr + sizeof(eeprom)); + eeprom.offset -= eeprom.len; + if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) + ret = -EFAULT; + + kfree(data); + return ret; +} + +static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops->get_eeprom || !ops->get_eeprom_len || + !ops->get_eeprom_len(dev)) + return -EOPNOTSUPP; + + return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, + ops->get_eeprom_len(dev)); +} + +static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + const struct ethtool_ops *ops = dev->ethtool_ops; + void __user *userbuf = useraddr + sizeof(eeprom); + u32 bytes_remaining; + u8 *data; + int ret = 0; + + if (!ops->set_eeprom || !ops->get_eeprom_len || + !ops->get_eeprom_len(dev)) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(PAGE_SIZE, GFP_USER); + if (!data) + return -ENOMEM; + + bytes_remaining = eeprom.len; + while (bytes_remaining > 0) { + eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); + + if (copy_from_user(data, userbuf, eeprom.len)) { + ret = -EFAULT; + break; + } + ret = ops->set_eeprom(dev, &eeprom, data); + if (ret) + break; + userbuf += eeprom.len; + eeprom.offset += eeprom.len; + bytes_remaining -= eeprom.len; + } + + kfree(data); + return ret; +} + +static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_coalesce(dev, &coalesce); + + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_coalesce coalesce; + + if (!dev->ethtool_ops->set_coalesce) + return -EOPNOTSUPP; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) + return -EFAULT; + + return dev->ethtool_ops->set_coalesce(dev, &coalesce); +} + +static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; + + if (!dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_ringparam(dev, &ringparam); + + if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; + + if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + + if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) + return -EFAULT; + + dev->ethtool_ops->get_ringparam(dev, &max); + + /* ensure new ring parameters are within the maximums */ + if (ringparam.rx_pending > max.rx_max_pending || + ringparam.rx_mini_pending > max.rx_mini_max_pending || + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || + ringparam.tx_pending > max.tx_max_pending) + return -EINVAL; + + return dev->ethtool_ops->set_ringparam(dev, &ringparam); +} + +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + if (copy_to_user(useraddr, &channels, sizeof(channels))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_set_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; + u16 from_channel, to_channel; + u32 max_rx_in_use = 0; + unsigned int i; + + if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + if (copy_from_user(&channels, useraddr, sizeof(channels))) + return -EFAULT; + + dev->ethtool_ops->get_channels(dev, &curr); + + /* ensure new counts are within the maximums */ + if (channels.rx_count > curr.max_rx || + channels.tx_count > curr.max_tx || + channels.combined_count > curr.max_combined || + channels.other_count > curr.max_other) + return -EINVAL; + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) + return -EINVAL; + + /* Disabling channels, query zero-copy AF_XDP sockets */ + from_channel = channels.combined_count + + min(channels.rx_count, channels.tx_count); + to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); + for (i = from_channel; i < to_channel; i++) + if (xdp_get_umem_from_qid(dev, i)) + return -EINVAL; + + return dev->ethtool_ops->set_channels(dev, &channels); +} + +static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM }; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_pauseparam(dev, &pauseparam); + + if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam; + + if (!dev->ethtool_ops->set_pauseparam) + return -EOPNOTSUPP; + + if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) + return -EFAULT; + + return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); +} + +static int ethtool_self_test(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_test test; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret, test_len; + + if (!ops->self_test || !ops->get_sset_count) + return -EOPNOTSUPP; + + test_len = ops->get_sset_count(dev, ETH_SS_TEST); + if (test_len < 0) + return test_len; + WARN_ON(test_len == 0); + + if (copy_from_user(&test, useraddr, sizeof(test))) + return -EFAULT; + + test.len = test_len; + data = kmalloc_array(test_len, sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->self_test(dev, &test, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &test, sizeof(test))) + goto out; + useraddr += sizeof(test); + if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + u8 *data; + int ret; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + ret = __ethtool_get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + if (ret > S32_MAX / ETH_GSTRING_LEN) + return -ENOMEM; + WARN_ON_ONCE(!ret); + + gstrings.len = ret; + + if (gstrings.len) { + data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); + if (!data) + return -ENOMEM; + + __ethtool_get_strings(dev, gstrings.string_set, data); + } else { + data = NULL; + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (gstrings.len && + copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + +out: + vfree(data); + return ret; +} + +static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_value id; + static bool busy; + const struct ethtool_ops *ops = dev->ethtool_ops; + int rc; + + if (!ops->set_phys_id) + return -EOPNOTSUPP; + + if (busy) + return -EBUSY; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); + if (rc < 0) + return rc; + + /* Drop the RTNL lock while waiting, but prevent reentry or + * removal of the device. + */ + busy = true; + dev_hold(dev); + rtnl_unlock(); + + if (rc == 0) { + /* Driver will handle this itself */ + schedule_timeout_interruptible( + id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); + } else { + /* Driver expects to be called at twice the frequency in rc */ + int n = rc * 2, i, interval = HZ / n; + + /* Count down seconds */ + do { + /* Count down iterations per second */ + i = n; + do { + rtnl_lock(); + rc = ops->set_phys_id(dev, + (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); + rtnl_unlock(); + if (rc) + break; + schedule_timeout_interruptible(interval); + } while (!signal_pending(current) && --i != 0); + } while (!signal_pending(current) && + (id.data == 0 || --id.data != 0)); + } + + rtnl_lock(); + dev_put(dev); + busy = false; + + (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); + return rc; +} + +static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret, n_stats; + + if (!ops->get_ethtool_stats || !ops->get_sset_count) + return -EOPNOTSUPP; + + n_stats = ops->get_sset_count(dev, ETH_SS_STATS); + if (n_stats < 0) + return n_stats; + if (n_stats > S32_MAX / sizeof(u64)) + return -ENOMEM; + WARN_ON_ONCE(!n_stats); + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = n_stats; + + if (n_stats) { + data = vzalloc(array_size(n_stats, sizeof(u64))); + if (!data) + return -ENOMEM; + ops->get_ethtool_stats(dev, &stats, data); + } else { + data = NULL; + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + vfree(data); + return ret; +} + +static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + struct ethtool_stats stats; + u64 *data; + int ret, n_stats; + + if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) + return -EOPNOTSUPP; + + if (dev->phydev && !ops->get_ethtool_phy_stats) + n_stats = phy_ethtool_get_sset_count(dev->phydev); + else + n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); + if (n_stats < 0) + return n_stats; + if (n_stats > S32_MAX / sizeof(u64)) + return -ENOMEM; + WARN_ON_ONCE(!n_stats); + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = n_stats; + + if (n_stats) { + data = vzalloc(array_size(n_stats, sizeof(u64))); + if (!data) + return -ENOMEM; + + if (dev->phydev && !ops->get_ethtool_phy_stats) { + ret = phy_ethtool_get_stats(dev->phydev, &stats, data); + if (ret < 0) + goto out; + } else { + ops->get_ethtool_phy_stats(dev, &stats, data); + } + } else { + data = NULL; + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + vfree(data); + return ret; +} + +static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_perm_addr epaddr; + + if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) + return -EFAULT; + + if (epaddr.size < dev->addr_len) + return -ETOOSMALL; + epaddr.size = dev->addr_len; + + if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) + return -EFAULT; + useraddr += sizeof(epaddr); + if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) + return -EFAULT; + return 0; +} + +static int ethtool_get_value(struct net_device *dev, char __user *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { .cmd = cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, + void (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + actor(dev, edata.data); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char __user *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return actor(dev, edata.data); +} + +static noinline_for_stack int ethtool_flash_device(struct net_device *dev, + char __user *useraddr) +{ + struct ethtool_flash efl; + + if (copy_from_user(&efl, useraddr, sizeof(efl))) + return -EFAULT; + efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + + if (!dev->ethtool_ops->flash_device) + return devlink_compat_flash_update(dev, efl.data); + + return dev->ethtool_ops->flash_device(dev, &efl); +} + +static int ethtool_set_dump(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_dump dump; + + if (!dev->ethtool_ops->set_dump) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + return dev->ethtool_ops->set_dump(dev, &dump); +} + +static int ethtool_get_dump_flag(struct net_device *dev, + void __user *useraddr) +{ + int ret; + struct ethtool_dump dump; + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + ret = ops->get_dump_flag(dev, &dump); + if (ret) + return ret; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) + return -EFAULT; + return 0; +} + +static int ethtool_get_dump_data(struct net_device *dev, + void __user *useraddr) +{ + int ret; + __u32 len; + struct ethtool_dump dump, tmp; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data = NULL; + + if (!ops->get_dump_data || !ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + memset(&tmp, 0, sizeof(tmp)); + tmp.cmd = ETHTOOL_GET_DUMP_FLAG; + ret = ops->get_dump_flag(dev, &tmp); + if (ret) + return ret; + + len = min(tmp.len, dump.len); + if (!len) + return -EFAULT; + + /* Don't ever let the driver think there's more space available + * than it requested with .get_dump_flag(). + */ + dump.len = len; + + /* Always allocate enough space to hold the whole thing so that the + * driver does not need to check the length and bother with partial + * dumping. + */ + data = vzalloc(tmp.len); + if (!data) + return -ENOMEM; + ret = ops->get_dump_data(dev, &dump, data); + if (ret) + goto out; + + /* There are two sane possibilities: + * 1. The driver's .get_dump_data() does not touch dump.len. + * 2. Or it may set dump.len to how much it really writes, which + * should be tmp.len (or len if it can do a partial dump). + * In any case respond to userspace with the actual length of data + * it's receiving. + */ + WARN_ON(dump.len != len && dump.len != tmp.len); + dump.len = len; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) { + ret = -EFAULT; + goto out; + } + useraddr += offsetof(struct ethtool_dump, data); + if (copy_to_user(useraddr, data, len)) + ret = -EFAULT; +out: + vfree(data); + return ret; +} + +static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) +{ + int err = 0; + struct ethtool_ts_info info; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GET_TS_INFO; + + if (phydev && phydev->drv && phydev->drv->ts_info) { + err = phydev->drv->ts_info(phydev, &info); + } else if (ops->get_ts_info) { + err = ops->get_ts_info(dev, &info); + } else { + info.so_timestamping = + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + info.phc_index = -1; + } + + if (err) + return err; + + if (copy_to_user(useraddr, &info, sizeof(info))) + err = -EFAULT; + + return err; +} + +static int __ethtool_get_module_info(struct net_device *dev, + struct ethtool_modinfo *modinfo) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + if (dev->sfp_bus) + return sfp_get_module_info(dev->sfp_bus, modinfo); + + if (phydev && phydev->drv && phydev->drv->module_info) + return phydev->drv->module_info(phydev, modinfo); + + if (ops->get_module_info) + return ops->get_module_info(dev, modinfo); + + return -EOPNOTSUPP; +} + +static int ethtool_get_module_info(struct net_device *dev, + void __user *useraddr) +{ + int ret; + struct ethtool_modinfo modinfo; + + if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) + return -EFAULT; + + ret = __ethtool_get_module_info(dev, &modinfo); + if (ret) + return ret; + + if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) + return -EFAULT; + + return 0; +} + +static int __ethtool_get_module_eeprom(struct net_device *dev, + struct ethtool_eeprom *ee, u8 *data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + if (dev->sfp_bus) + return sfp_get_module_eeprom(dev->sfp_bus, ee, data); + + if (phydev && phydev->drv && phydev->drv->module_eeprom) + return phydev->drv->module_eeprom(phydev, ee, data); + + if (ops->get_module_eeprom) + return ops->get_module_eeprom(dev, ee, data); + + return -EOPNOTSUPP; +} + +static int ethtool_get_module_eeprom(struct net_device *dev, + void __user *useraddr) +{ + int ret; + struct ethtool_modinfo modinfo; + + ret = __ethtool_get_module_info(dev, &modinfo); + if (ret) + return ret; + + return ethtool_get_any_eeprom(dev, useraddr, + __ethtool_get_module_eeprom, + modinfo.eeprom_len); +} + +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + case ETHTOOL_TX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (tuna->len != sizeof(u16) || + tuna->type_id != ETHTOOL_TUNABLE_U16) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + useraddr += sizeof(tuna); + data = memdup_user(useraddr, tuna.len); + if (IS_ERR(data)) + return PTR_ERR(data); + ret = ops->set_tunable(dev, &tuna, data); + + kfree(data); + return ret; +} + +static noinline_for_stack int +ethtool_get_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int ret; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if (!dev->ethtool_ops->get_per_queue_coalesce) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, + MAX_NUM_QUEUE); + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + return ret; + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + useraddr += sizeof(coalesce); + } + + return 0; +} + +static noinline_for_stack int +ethtool_set_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int i, ret = 0; + int n_queue; + struct ethtool_coalesce *backup = NULL, *tmp = NULL; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if ((!dev->ethtool_ops->set_per_queue_coalesce) || + (!dev->ethtool_ops->get_per_queue_coalesce)) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); + n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); + tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); + if (!backup) + return -ENOMEM; + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); + if (ret != 0) + goto roll_back; + + tmp++; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { + ret = -EFAULT; + goto roll_back; + } + + ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + goto roll_back; + + useraddr += sizeof(coalesce); + } + +roll_back: + if (ret != 0) { + tmp = backup; + for_each_set_bit(i, queue_mask, bit) { + dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); + tmp++; + } + } + kfree(backup); + + return ret; +} + +static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, + void __user *useraddr, u32 sub_cmd) +{ + struct ethtool_per_queue_op per_queue_opt; + + if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) + return -EFAULT; + + if (per_queue_opt.sub_command != sub_cmd) + return -EINVAL; + + switch (per_queue_opt.sub_command) { + case ETHTOOL_GCOALESCE: + return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); + case ETHTOOL_SCOALESCE: + return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); + default: + return -EOPNOTSUPP; + }; +} + +static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + case ETHTOOL_PHY_FAST_LINK_DOWN: + if (tuna->len != sizeof(u8) || + tuna->type_id != ETHTOOL_TUNABLE_U8) + return -EINVAL; + break; + case ETHTOOL_PHY_EDPD: + if (tuna->len != sizeof(u16) || + tuna->type_id != ETHTOOL_TUNABLE_U16) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int get_phy_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + struct phy_device *phydev = dev->phydev; + void *data; + + if (!(phydev && phydev->drv && phydev->drv->get_tunable)) + return -EOPNOTSUPP; + + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_phy_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + mutex_lock(&phydev->lock); + ret = phydev->drv->get_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int set_phy_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + struct phy_device *phydev = dev->phydev; + void *data; + + if (!(phydev && phydev->drv && phydev->drv->set_tunable)) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_phy_tunable_valid(&tuna); + if (ret) + return ret; + useraddr += sizeof(tuna); + data = memdup_user(useraddr, tuna.len); + if (IS_ERR(data)) + return PTR_ERR(data); + mutex_lock(&phydev->lock); + ret = phydev->drv->set_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); + + kfree(data); + return ret; +} + +static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM }; + int rc; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + + rc = dev->ethtool_ops->get_fecparam(dev, &fecparam); + if (rc) + return rc; + + if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam; + + if (!dev->ethtool_ops->set_fecparam) + return -EOPNOTSUPP; + + if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) + return -EFAULT; + + return dev->ethtool_ops->set_fecparam(dev, &fecparam); +} + +/* The main entry point in this file. Called from net/core/dev_ioctl.c */ + +int dev_ethtool(struct net *net, struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + void __user *useraddr = ifr->ifr_data; + u32 ethcmd, sub_cmd; + int rc; + netdev_features_t old_features; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + + if (ethcmd == ETHTOOL_PERQUEUE) { + if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) + return -EFAULT; + } else { + sub_cmd = ethcmd; + } + /* Allow some commands to be done by anyone */ + switch (sub_cmd) { + case ETHTOOL_GSET: + case ETHTOOL_GDRVINFO: + case ETHTOOL_GMSGLVL: + case ETHTOOL_GLINK: + case ETHTOOL_GCOALESCE: + case ETHTOOL_GRINGPARAM: + case ETHTOOL_GPAUSEPARAM: + case ETHTOOL_GRXCSUM: + case ETHTOOL_GTXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GSSET_INFO: + case ETHTOOL_GSTRINGS: + case ETHTOOL_GSTATS: + case ETHTOOL_GPHYSTATS: + case ETHTOOL_GTSO: + case ETHTOOL_GPERMADDR: + case ETHTOOL_GUFO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + case ETHTOOL_GFLAGS: + case ETHTOOL_GPFLAGS: + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GRXFHINDIR: + case ETHTOOL_GRSSH: + case ETHTOOL_GFEATURES: + case ETHTOOL_GCHANNELS: + case ETHTOOL_GET_TS_INFO: + case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: + case ETHTOOL_PHY_GTUNABLE: + case ETHTOOL_GLINKSETTINGS: + case ETHTOOL_GFECPARAM: + break; + default: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + } + + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + if (rc < 0) + return rc; + } + old_features = dev->features; + + switch (ethcmd) { + case ETHTOOL_GSET: + rc = ethtool_get_settings(dev, useraddr); + break; + case ETHTOOL_SSET: + rc = ethtool_set_settings(dev, useraddr); + break; + case ETHTOOL_GDRVINFO: + rc = ethtool_get_drvinfo(dev, useraddr); + break; + case ETHTOOL_GREGS: + rc = ethtool_get_regs(dev, useraddr); + break; + case ETHTOOL_GWOL: + rc = ethtool_get_wol(dev, useraddr); + break; + case ETHTOOL_SWOL: + rc = ethtool_set_wol(dev, useraddr); + break; + case ETHTOOL_GMSGLVL: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_msglevel); + break; + case ETHTOOL_SMSGLVL: + rc = ethtool_set_value_void(dev, useraddr, + dev->ethtool_ops->set_msglevel); + break; + case ETHTOOL_GEEE: + rc = ethtool_get_eee(dev, useraddr); + break; + case ETHTOOL_SEEE: + rc = ethtool_set_eee(dev, useraddr); + break; + case ETHTOOL_NWAY_RST: + rc = ethtool_nway_reset(dev); + break; + case ETHTOOL_GLINK: + rc = ethtool_get_link(dev, useraddr); + break; + case ETHTOOL_GEEPROM: + rc = ethtool_get_eeprom(dev, useraddr); + break; + case ETHTOOL_SEEPROM: + rc = ethtool_set_eeprom(dev, useraddr); + break; + case ETHTOOL_GCOALESCE: + rc = ethtool_get_coalesce(dev, useraddr); + break; + case ETHTOOL_SCOALESCE: + rc = ethtool_set_coalesce(dev, useraddr); + break; + case ETHTOOL_GRINGPARAM: + rc = ethtool_get_ringparam(dev, useraddr); + break; + case ETHTOOL_SRINGPARAM: + rc = ethtool_set_ringparam(dev, useraddr); + break; + case ETHTOOL_GPAUSEPARAM: + rc = ethtool_get_pauseparam(dev, useraddr); + break; + case ETHTOOL_SPAUSEPARAM: + rc = ethtool_set_pauseparam(dev, useraddr); + break; + case ETHTOOL_TEST: + rc = ethtool_self_test(dev, useraddr); + break; + case ETHTOOL_GSTRINGS: + rc = ethtool_get_strings(dev, useraddr); + break; + case ETHTOOL_PHYS_ID: + rc = ethtool_phys_id(dev, useraddr); + break; + case ETHTOOL_GSTATS: + rc = ethtool_get_stats(dev, useraddr); + break; + case ETHTOOL_GPERMADDR: + rc = ethtool_get_perm_addr(dev, useraddr); + break; + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + __ethtool_get_flags); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); + break; + case ETHTOOL_GPFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_priv_flags); + break; + case ETHTOOL_SPFLAGS: + rc = ethtool_set_value(dev, useraddr, + dev->ethtool_ops->set_priv_flags); + break; + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_FLASHDEV: + rc = ethtool_flash_device(dev, useraddr); + break; + case ETHTOOL_RESET: + rc = ethtool_reset(dev, useraddr); + break; + case ETHTOOL_GSSET_INFO: + rc = ethtool_get_sset_info(dev, useraddr); + break; + case ETHTOOL_GRXFHINDIR: + rc = ethtool_get_rxfh_indir(dev, useraddr); + break; + case ETHTOOL_SRXFHINDIR: + rc = ethtool_set_rxfh_indir(dev, useraddr); + break; + case ETHTOOL_GRSSH: + rc = ethtool_get_rxfh(dev, useraddr); + break; + case ETHTOOL_SRSSH: + rc = ethtool_set_rxfh(dev, useraddr); + break; + case ETHTOOL_GFEATURES: + rc = ethtool_get_features(dev, useraddr); + break; + case ETHTOOL_SFEATURES: + rc = ethtool_set_features(dev, useraddr); + break; + case ETHTOOL_GTXCSUM: + case ETHTOOL_GRXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GTSO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + rc = ethtool_get_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_STXCSUM: + case ETHTOOL_SRXCSUM: + case ETHTOOL_SSG: + case ETHTOOL_STSO: + case ETHTOOL_SGSO: + case ETHTOOL_SGRO: + rc = ethtool_set_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + case ETHTOOL_SCHANNELS: + rc = ethtool_set_channels(dev, useraddr); + break; + case ETHTOOL_SET_DUMP: + rc = ethtool_set_dump(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_FLAG: + rc = ethtool_get_dump_flag(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_DATA: + rc = ethtool_get_dump_data(dev, useraddr); + break; + case ETHTOOL_GET_TS_INFO: + rc = ethtool_get_ts_info(dev, useraddr); + break; + case ETHTOOL_GMODULEINFO: + rc = ethtool_get_module_info(dev, useraddr); + break; + case ETHTOOL_GMODULEEEPROM: + rc = ethtool_get_module_eeprom(dev, useraddr); + break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; + case ETHTOOL_GPHYSTATS: + rc = ethtool_get_phy_stats(dev, useraddr); + break; + case ETHTOOL_PERQUEUE: + rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); + break; + case ETHTOOL_GLINKSETTINGS: + rc = ethtool_get_link_ksettings(dev, useraddr); + break; + case ETHTOOL_SLINKSETTINGS: + rc = ethtool_set_link_ksettings(dev, useraddr); + break; + case ETHTOOL_PHY_GTUNABLE: + rc = get_phy_tunable(dev, useraddr); + break; + case ETHTOOL_PHY_STUNABLE: + rc = set_phy_tunable(dev, useraddr); + break; + case ETHTOOL_GFECPARAM: + rc = ethtool_get_fecparam(dev, useraddr); + break; + case ETHTOOL_SFECPARAM: + rc = ethtool_set_fecparam(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + + return rc; +} + +struct ethtool_rx_flow_key { + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; + struct flow_dissector_key_ip ip; + struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_eth_addrs eth_addrs; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct ethtool_rx_flow_match { + struct flow_dissector dissector; + struct ethtool_rx_flow_key key; + struct ethtool_rx_flow_key mask; +}; + +struct ethtool_rx_flow_rule * +ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) +{ + const struct ethtool_rx_flow_spec *fs = input->fs; + static struct in6_addr zero_addr = {}; + struct ethtool_rx_flow_match *match; + struct ethtool_rx_flow_rule *flow; + struct flow_action_entry *act; + + flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) + + sizeof(struct ethtool_rx_flow_match), GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + /* ethtool_rx supports only one single action per rule. */ + flow->rule = flow_rule_alloc(1); + if (!flow->rule) { + kfree(flow); + return ERR_PTR(-ENOMEM); + } + + match = (struct ethtool_rx_flow_match *)flow->priv; + flow->rule->match.dissector = &match->dissector; + flow->rule->match.mask = &match->mask; + flow->rule->match.key = &match->key; + + match->mask.basic.n_proto = htons(0xffff); + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { + case ETHER_FLOW: { + const struct ethhdr *ether_spec, *ether_m_spec; + + ether_spec = &fs->h_u.ether_spec; + ether_m_spec = &fs->m_u.ether_spec; + + if (!is_zero_ether_addr(ether_m_spec->h_source)) { + ether_addr_copy(match->key.eth_addrs.src, + ether_spec->h_source); + ether_addr_copy(match->mask.eth_addrs.src, + ether_m_spec->h_source); + } + if (!is_zero_ether_addr(ether_m_spec->h_dest)) { + ether_addr_copy(match->key.eth_addrs.dst, + ether_spec->h_dest); + ether_addr_copy(match->mask.eth_addrs.dst, + ether_m_spec->h_dest); + } + if (ether_m_spec->h_proto) { + match->key.basic.n_proto = ether_spec->h_proto; + match->mask.basic.n_proto = ether_m_spec->h_proto; + } + } + break; + case TCP_V4_FLOW: + case UDP_V4_FLOW: { + const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; + + match->key.basic.n_proto = htons(ETH_P_IP); + + v4_spec = &fs->h_u.tcp_ip4_spec; + v4_m_spec = &fs->m_u.tcp_ip4_spec; + + if (v4_m_spec->ip4src) { + match->key.ipv4.src = v4_spec->ip4src; + match->mask.ipv4.src = v4_m_spec->ip4src; + } + if (v4_m_spec->ip4dst) { + match->key.ipv4.dst = v4_spec->ip4dst; + match->mask.ipv4.dst = v4_m_spec->ip4dst; + } + if (v4_m_spec->ip4src || + v4_m_spec->ip4dst) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); + match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = + offsetof(struct ethtool_rx_flow_key, ipv4); + } + if (v4_m_spec->psrc) { + match->key.tp.src = v4_spec->psrc; + match->mask.tp.src = v4_m_spec->psrc; + } + if (v4_m_spec->pdst) { + match->key.tp.dst = v4_spec->pdst; + match->mask.tp.dst = v4_m_spec->pdst; + } + if (v4_m_spec->psrc || + v4_m_spec->pdst) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = + offsetof(struct ethtool_rx_flow_key, tp); + } + if (v4_m_spec->tos) { + match->key.ip.tos = v4_spec->tos; + match->mask.ip.tos = v4_m_spec->tos; + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IP); + match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = + offsetof(struct ethtool_rx_flow_key, ip); + } + } + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: { + const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec; + + match->key.basic.n_proto = htons(ETH_P_IPV6); + + v6_spec = &fs->h_u.tcp_ip6_spec; + v6_m_spec = &fs->m_u.tcp_ip6_spec; + if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { + memcpy(&match->key.ipv6.src, v6_spec->ip6src, + sizeof(match->key.ipv6.src)); + memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src, + sizeof(match->mask.ipv6.src)); + } + if (memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) { + memcpy(&match->key.ipv6.dst, v6_spec->ip6dst, + sizeof(match->key.ipv6.dst)); + memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst, + sizeof(match->mask.ipv6.dst)); + } + if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr)) || + memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); + match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = + offsetof(struct ethtool_rx_flow_key, ipv6); + } + if (v6_m_spec->psrc) { + match->key.tp.src = v6_spec->psrc; + match->mask.tp.src = v6_m_spec->psrc; + } + if (v6_m_spec->pdst) { + match->key.tp.dst = v6_spec->pdst; + match->mask.tp.dst = v6_m_spec->pdst; + } + if (v6_m_spec->psrc || + v6_m_spec->pdst) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = + offsetof(struct ethtool_rx_flow_key, tp); + } + if (v6_m_spec->tclass) { + match->key.ip.tos = v6_spec->tclass; + match->mask.ip.tos = v6_m_spec->tclass; + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IP); + match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = + offsetof(struct ethtool_rx_flow_key, ip); + } + } + break; + default: + ethtool_rx_flow_rule_destroy(flow); + return ERR_PTR(-EINVAL); + } + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + match->key.basic.ip_proto = IPPROTO_TCP; + break; + case UDP_V4_FLOW: + case UDP_V6_FLOW: + match->key.basic.ip_proto = IPPROTO_UDP; + break; + } + match->mask.basic.ip_proto = 0xff; + + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = + offsetof(struct ethtool_rx_flow_key, basic); + + if (fs->flow_type & FLOW_EXT) { + const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; + const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; + + if (ext_m_spec->vlan_etype) { + match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; + match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; + } + + if (ext_m_spec->vlan_tci) { + match->key.vlan.vlan_id = + ntohs(ext_h_spec->vlan_tci) & 0x0fff; + match->mask.vlan.vlan_id = + ntohs(ext_m_spec->vlan_tci) & 0x0fff; + + match->key.vlan.vlan_dei = + !!(ext_h_spec->vlan_tci & htons(0x1000)); + match->mask.vlan.vlan_dei = + !!(ext_m_spec->vlan_tci & htons(0x1000)); + + match->key.vlan.vlan_priority = + (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; + match->mask.vlan.vlan_priority = + (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; + } + + if (ext_m_spec->vlan_etype || + ext_m_spec->vlan_tci) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_VLAN); + match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = + offsetof(struct ethtool_rx_flow_key, vlan); + } + } + if (fs->flow_type & FLOW_MAC_EXT) { + const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; + const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; + + memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest, + ETH_ALEN); + memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest, + ETH_ALEN); + + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); + match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = + offsetof(struct ethtool_rx_flow_key, eth_addrs); + } + + act = &flow->rule->action.entries[0]; + switch (fs->ring_cookie) { + case RX_CLS_FLOW_DISC: + act->id = FLOW_ACTION_DROP; + break; + case RX_CLS_FLOW_WAKE: + act->id = FLOW_ACTION_WAKE; + break; + default: + act->id = FLOW_ACTION_QUEUE; + if (fs->flow_type & FLOW_RSS) + act->queue.ctx = input->rss_ctx; + + act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); + act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie); + break; + } + + return flow; +} +EXPORT_SYMBOL(ethtool_rx_flow_rule_create); + +void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow) +{ + kfree(flow->rule); + kfree(flow); +} +EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy); -- cgit v1.2.3 From d44e13108b6da4e1c4778bd654a470208e02ef37 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 11 Dec 2019 10:58:29 +0100 Subject: ethtool: move string arrays into common file Introduce file net/ethtool/common.c for code shared by ioctl and netlink ethtool interface. Move name tables of features, RSS hash functions, tunables and PHY tunables into this file. Signed-off-by: Michal Kubecek Reviewed-by: Jiri Pirko Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/ethtool/Makefile | 2 +- net/ethtool/common.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ethtool/common.h | 17 +++++++++++ net/ethtool/ioctl.c | 84 ++------------------------------------------------- 4 files changed, 105 insertions(+), 83 deletions(-) create mode 100644 net/ethtool/common.c create mode 100644 net/ethtool/common.h (limited to 'net') diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 7e5c9eb85c90..f68387618973 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += ioctl.o +obj-y += ioctl.o common.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c new file mode 100644 index 000000000000..8b5e11e7e0a6 --- /dev/null +++ b/net/ethtool/common.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "common.h" + +const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { + [NETIF_F_SG_BIT] = "tx-scatter-gather", + [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", + [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", + [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", + [NETIF_F_HIGHDMA_BIT] = "highdma", + [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", + [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", + + [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", + [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", + [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", + [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", + [NETIF_F_GSO_BIT] = "tx-generic-segmentation", + [NETIF_F_LLTX_BIT] = "tx-lockless", + [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", + [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", + [NETIF_F_LRO_BIT] = "rx-lro", + + [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", + [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", + [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", + [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", + [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", + [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", + [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", + [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", + [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", + [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", + + [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", + [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", + [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", + [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", + [NETIF_F_RXHASH_BIT] = "rx-hashing", + [NETIF_F_RXCSUM_BIT] = "rx-checksum", + [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", + [NETIF_F_LOOPBACK_BIT] = "loopback", + [NETIF_F_RXFCS_BIT] = "rx-fcs", + [NETIF_F_RXALL_BIT] = "rx-all", + [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", + [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", + [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", + [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", + [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", +}; + +const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", + [ETH_RSS_HASH_CRC32_BIT] = "crc32", +}; + +const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", + [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", +}; + +const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", + [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", + [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", +}; diff --git a/net/ethtool/common.h b/net/ethtool/common.h new file mode 100644 index 000000000000..336566430be4 --- /dev/null +++ b/net/ethtool/common.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ETHTOOL_COMMON_H +#define _ETHTOOL_COMMON_H + +#include + +extern const char +netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; +extern const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN]; +extern const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; +extern const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; + +#endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index cd9bc67381b2..b262db5a1d91 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -27,6 +27,8 @@ #include #include +#include "common.h" + /* * Some useful ethtool_ops methods that're device independent. * If we find that all drivers want to do the same thing here, @@ -54,88 +56,6 @@ EXPORT_SYMBOL(ethtool_op_get_ts_info); #define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) -static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { - [NETIF_F_SG_BIT] = "tx-scatter-gather", - [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", - [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", - [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", - [NETIF_F_HIGHDMA_BIT] = "highdma", - [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", - [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", - - [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", - [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", - [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", - [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", - [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", - [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", - [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_LLTX_BIT] = "tx-lockless", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", - [NETIF_F_GRO_BIT] = "rx-gro", - [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", - [NETIF_F_LRO_BIT] = "rx-lro", - - [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", - [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", - [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", - [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", - [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", - [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", - [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", - [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", - [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", - [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", - [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", - [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", - [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", - [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", - - [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", - [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", - [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", - [NETIF_F_RXHASH_BIT] = "rx-hashing", - [NETIF_F_RXCSUM_BIT] = "rx-checksum", - [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", - [NETIF_F_LOOPBACK_BIT] = "loopback", - [NETIF_F_RXFCS_BIT] = "rx-fcs", - [NETIF_F_RXALL_BIT] = "rx-all", - [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", - [NETIF_F_HW_TC_BIT] = "hw-tc-offload", - [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", - [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", - [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", - [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", - [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", - [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", -}; - -static const char -rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { - [ETH_RSS_HASH_TOP_BIT] = "toeplitz", - [ETH_RSS_HASH_XOR_BIT] = "xor", - [ETH_RSS_HASH_CRC32_BIT] = "crc32", -}; - -static const char -tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", - [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", - [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", -}; - -static const char -phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { - [ETHTOOL_ID_UNSPEC] = "Unspec", - [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", - [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", - [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", -}; - static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { -- cgit v1.2.3 From 428c122f5f6b54f40bd51c47495104b534b5a57c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 11 Dec 2019 10:58:34 +0100 Subject: ethtool: provide link mode names as a string set Unlike e.g. netdev features, the ethtool ioctl interface requires link mode table to be in sync between kernel and userspace for userspace to be able to display and set all link modes supported by kernel. The way arbitrary length bitsets are implemented in netlink interface, this will be no longer needed. To allow userspace to access all link modes running kernel supports, add table of ethernet link mode names and make it available as a string set to userspace GET_STRSET requests. Add build time check to make sure names are defined for all modes declared in enum ethtool_link_mode_bit_indices. Once the string set is available, make it also accessible via ioctl. Signed-off-by: Michal Kubecek Reviewed-by: Andrew Lunn Reviewed-by: Jiri Pirko Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 2 ++ net/ethtool/common.c | 86 ++++++++++++++++++++++++++++++++++++++++++++ net/ethtool/common.h | 5 +++ net/ethtool/ioctl.c | 6 ++++ 4 files changed, 99 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d4591792f0b4..f44155840b07 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -593,6 +593,7 @@ struct ethtool_pauseparam { * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS * @ETH_SS_PHY_TUNABLES: PHY tunable names + * @ETH_SS_LINK_MODES: link mode names */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -604,6 +605,7 @@ enum ethtool_stringset { ETH_SS_TUNABLES, ETH_SS_PHY_STATS, ETH_SS_PHY_TUNABLES, + ETH_SS_LINK_MODES, }; /** diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 8b5e11e7e0a6..0a8728565356 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -83,3 +83,89 @@ phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down", [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down", }; + +#define __LINK_MODE_NAME(speed, type, duplex) \ + #speed "base" #type "/" #duplex +#define __DEFINE_LINK_MODE_NAME(speed, type, duplex) \ + [ETHTOOL_LINK_MODE(speed, type, duplex)] = \ + __LINK_MODE_NAME(speed, type, duplex) +#define __DEFINE_SPECIAL_MODE_NAME(_mode, _name) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = _name + +const char link_mode_names[][ETH_GSTRING_LEN] = { + __DEFINE_LINK_MODE_NAME(10, T, Half), + __DEFINE_LINK_MODE_NAME(10, T, Full), + __DEFINE_LINK_MODE_NAME(100, T, Half), + __DEFINE_LINK_MODE_NAME(100, T, Full), + __DEFINE_LINK_MODE_NAME(1000, T, Half), + __DEFINE_LINK_MODE_NAME(1000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(Autoneg, "Autoneg"), + __DEFINE_SPECIAL_MODE_NAME(TP, "TP"), + __DEFINE_SPECIAL_MODE_NAME(AUI, "AUI"), + __DEFINE_SPECIAL_MODE_NAME(MII, "MII"), + __DEFINE_SPECIAL_MODE_NAME(FIBRE, "FIBRE"), + __DEFINE_SPECIAL_MODE_NAME(BNC, "BNC"), + __DEFINE_LINK_MODE_NAME(10000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(Pause, "Pause"), + __DEFINE_SPECIAL_MODE_NAME(Asym_Pause, "Asym_Pause"), + __DEFINE_LINK_MODE_NAME(2500, X, Full), + __DEFINE_SPECIAL_MODE_NAME(Backplane, "Backplane"), + __DEFINE_LINK_MODE_NAME(1000, KX, Full), + __DEFINE_LINK_MODE_NAME(10000, KX4, Full), + __DEFINE_LINK_MODE_NAME(10000, KR, Full), + __DEFINE_SPECIAL_MODE_NAME(10000baseR_FEC, "10000baseR_FEC"), + __DEFINE_LINK_MODE_NAME(20000, MLD2, Full), + __DEFINE_LINK_MODE_NAME(20000, KR2, Full), + __DEFINE_LINK_MODE_NAME(40000, KR4, Full), + __DEFINE_LINK_MODE_NAME(40000, CR4, Full), + __DEFINE_LINK_MODE_NAME(40000, SR4, Full), + __DEFINE_LINK_MODE_NAME(40000, LR4, Full), + __DEFINE_LINK_MODE_NAME(56000, KR4, Full), + __DEFINE_LINK_MODE_NAME(56000, CR4, Full), + __DEFINE_LINK_MODE_NAME(56000, SR4, Full), + __DEFINE_LINK_MODE_NAME(56000, LR4, Full), + __DEFINE_LINK_MODE_NAME(25000, CR, Full), + __DEFINE_LINK_MODE_NAME(25000, KR, Full), + __DEFINE_LINK_MODE_NAME(25000, SR, Full), + __DEFINE_LINK_MODE_NAME(50000, CR2, Full), + __DEFINE_LINK_MODE_NAME(50000, KR2, Full), + __DEFINE_LINK_MODE_NAME(100000, KR4, Full), + __DEFINE_LINK_MODE_NAME(100000, SR4, Full), + __DEFINE_LINK_MODE_NAME(100000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_NAME(50000, SR2, Full), + __DEFINE_LINK_MODE_NAME(1000, X, Full), + __DEFINE_LINK_MODE_NAME(10000, CR, Full), + __DEFINE_LINK_MODE_NAME(10000, SR, Full), + __DEFINE_LINK_MODE_NAME(10000, LR, Full), + __DEFINE_LINK_MODE_NAME(10000, LRM, Full), + __DEFINE_LINK_MODE_NAME(10000, ER, Full), + __DEFINE_LINK_MODE_NAME(2500, T, Full), + __DEFINE_LINK_MODE_NAME(5000, T, Full), + __DEFINE_SPECIAL_MODE_NAME(FEC_NONE, "None"), + __DEFINE_SPECIAL_MODE_NAME(FEC_RS, "RS"), + __DEFINE_SPECIAL_MODE_NAME(FEC_BASER, "BASER"), + __DEFINE_LINK_MODE_NAME(50000, KR, Full), + __DEFINE_LINK_MODE_NAME(50000, SR, Full), + __DEFINE_LINK_MODE_NAME(50000, CR, Full), + __DEFINE_LINK_MODE_NAME(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(50000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, KR2, Full), + __DEFINE_LINK_MODE_NAME(100000, SR2, Full), + __DEFINE_LINK_MODE_NAME(100000, CR2, Full), + __DEFINE_LINK_MODE_NAME(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(100000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, KR4, Full), + __DEFINE_LINK_MODE_NAME(200000, SR4, Full), + __DEFINE_LINK_MODE_NAME(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(200000, DR4, Full), + __DEFINE_LINK_MODE_NAME(200000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100, T1, Full), + __DEFINE_LINK_MODE_NAME(1000, T1, Full), + __DEFINE_LINK_MODE_NAME(400000, KR8, Full), + __DEFINE_LINK_MODE_NAME(400000, SR8, Full), + __DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_NAME(400000, DR8, Full), + __DEFINE_LINK_MODE_NAME(400000, CR8, Full), +}; +static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 336566430be4..bbb788908cb1 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -5,6 +5,10 @@ #include +/* compose link mode index from speed, type and duplex */ +#define ETHTOOL_LINK_MODE(speed, type, duplex) \ + ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT + extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; extern const char @@ -13,5 +17,6 @@ extern const char tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; +extern const char link_mode_names[][ETH_GSTRING_LEN]; #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index b262db5a1d91..aed2c2cf1623 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -154,6 +154,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) !ops->get_ethtool_phy_stats) return phy_ethtool_get_sset_count(dev->phydev); + if (sset == ETH_SS_LINK_MODES) + return __ETHTOOL_LINK_MODE_MASK_NBITS; + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -178,6 +181,9 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats) phy_ethtool_get_strings(dev->phydev, data); + else if (stringset == ETH_SS_LINK_MODES) + memcpy(data, link_mode_names, + __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); -- cgit v1.2.3 From 0290bd291cc0e0488e35e66bf39efcd7d9d9122b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 10 Dec 2019 09:23:51 -0500 Subject: netdev: pass the stuck queue to the timeout handler This allows incrementing the correct timeout statistic without any mess. Down the road, devices can learn to reset just the specific queue. The patch was generated with the following script: use strict; use warnings; our $^I = '.bak'; my @work = ( ["arch/m68k/emu/nfeth.c", "nfeth_tx_timeout"], ["arch/um/drivers/net_kern.c", "uml_net_tx_timeout"], ["arch/um/drivers/vector_kern.c", "vector_net_tx_timeout"], ["arch/xtensa/platforms/iss/network.c", "iss_net_tx_timeout"], ["drivers/char/pcmcia/synclink_cs.c", "hdlcdev_tx_timeout"], ["drivers/infiniband/ulp/ipoib/ipoib_main.c", "ipoib_timeout"], ["drivers/infiniband/ulp/ipoib/ipoib_main.c", "ipoib_timeout"], ["drivers/message/fusion/mptlan.c", "mpt_lan_tx_timeout"], ["drivers/misc/sgi-xp/xpnet.c", "xpnet_dev_tx_timeout"], ["drivers/net/appletalk/cops.c", "cops_timeout"], ["drivers/net/arcnet/arcdevice.h", "arcnet_timeout"], ["drivers/net/arcnet/arcnet.c", "arcnet_timeout"], ["drivers/net/arcnet/com20020.c", "arcnet_timeout"], ["drivers/net/ethernet/3com/3c509.c", "el3_tx_timeout"], ["drivers/net/ethernet/3com/3c515.c", "corkscrew_timeout"], ["drivers/net/ethernet/3com/3c574_cs.c", "el3_tx_timeout"], ["drivers/net/ethernet/3com/3c589_cs.c", "el3_tx_timeout"], ["drivers/net/ethernet/3com/3c59x.c", "vortex_tx_timeout"], ["drivers/net/ethernet/3com/3c59x.c", "vortex_tx_timeout"], ["drivers/net/ethernet/3com/typhoon.c", "typhoon_tx_timeout"], ["drivers/net/ethernet/8390/8390.h", "ei_tx_timeout"], ["drivers/net/ethernet/8390/8390.h", "eip_tx_timeout"], ["drivers/net/ethernet/8390/8390.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/8390p.c", "eip_tx_timeout"], ["drivers/net/ethernet/8390/ax88796.c", "ax_ei_tx_timeout"], ["drivers/net/ethernet/8390/axnet_cs.c", "axnet_tx_timeout"], ["drivers/net/ethernet/8390/etherh.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/hydra.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/mac8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/mcf8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/lib8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/ne2k-pci.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/pcnet_cs.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/smc-ultra.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/wd.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/zorro8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/adaptec/starfire.c", "tx_timeout"], ["drivers/net/ethernet/agere/et131x.c", "et131x_tx_timeout"], ["drivers/net/ethernet/allwinner/sun4i-emac.c", "emac_timeout"], ["drivers/net/ethernet/alteon/acenic.c", "ace_watchdog"], ["drivers/net/ethernet/amazon/ena/ena_netdev.c", "ena_tx_timeout"], ["drivers/net/ethernet/amd/7990.h", "lance_tx_timeout"], ["drivers/net/ethernet/amd/7990.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/a2065.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/am79c961a.c", "am79c961_timeout"], ["drivers/net/ethernet/amd/amd8111e.c", "amd8111e_tx_timeout"], ["drivers/net/ethernet/amd/ariadne.c", "ariadne_tx_timeout"], ["drivers/net/ethernet/amd/atarilance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/au1000_eth.c", "au1000_tx_timeout"], ["drivers/net/ethernet/amd/declance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/lance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/mvme147.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/ni65.c", "ni65_timeout"], ["drivers/net/ethernet/amd/nmclan_cs.c", "mace_tx_timeout"], ["drivers/net/ethernet/amd/pcnet32.c", "pcnet32_tx_timeout"], ["drivers/net/ethernet/amd/sunlance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/xgbe/xgbe-drv.c", "xgbe_tx_timeout"], ["drivers/net/ethernet/apm/xgene-v2/main.c", "xge_timeout"], ["drivers/net/ethernet/apm/xgene/xgene_enet_main.c", "xgene_enet_timeout"], ["drivers/net/ethernet/apple/macmace.c", "mace_tx_timeout"], ["drivers/net/ethernet/atheros/ag71xx.c", "ag71xx_tx_timeout"], ["drivers/net/ethernet/atheros/alx/main.c", "alx_tx_timeout"], ["drivers/net/ethernet/atheros/atl1c/atl1c_main.c", "atl1c_tx_timeout"], ["drivers/net/ethernet/atheros/atl1e/atl1e_main.c", "atl1e_tx_timeout"], ["drivers/net/ethernet/atheros/atlx/atl.c", "atlx_tx_timeout"], ["drivers/net/ethernet/atheros/atlx/atl1.c", "atlx_tx_timeout"], ["drivers/net/ethernet/atheros/atlx/atl2.c", "atl2_tx_timeout"], ["drivers/net/ethernet/broadcom/b44.c", "b44_tx_timeout"], ["drivers/net/ethernet/broadcom/bcmsysport.c", "bcm_sysport_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2.c", "bnx2_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h", "bnx2x_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c", "bnx2x_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c", "bnx2x_tx_timeout"], ["drivers/net/ethernet/broadcom/bnxt/bnxt.c", "bnxt_tx_timeout"], ["drivers/net/ethernet/broadcom/genet/bcmgenet.c", "bcmgenet_timeout"], ["drivers/net/ethernet/broadcom/sb1250-mac.c", "sbmac_tx_timeout"], ["drivers/net/ethernet/broadcom/tg3.c", "tg3_tx_timeout"], ["drivers/net/ethernet/calxeda/xgmac.c", "xgmac_tx_timeout"], ["drivers/net/ethernet/cavium/liquidio/lio_main.c", "liquidio_tx_timeout"], ["drivers/net/ethernet/cavium/liquidio/lio_vf_main.c", "liquidio_tx_timeout"], ["drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c", "lio_vf_rep_tx_timeout"], ["drivers/net/ethernet/cavium/thunder/nicvf_main.c", "nicvf_tx_timeout"], ["drivers/net/ethernet/cirrus/cs89x0.c", "net_timeout"], ["drivers/net/ethernet/cisco/enic/enic_main.c", "enic_tx_timeout"], ["drivers/net/ethernet/cisco/enic/enic_main.c", "enic_tx_timeout"], ["drivers/net/ethernet/cortina/gemini.c", "gmac_tx_timeout"], ["drivers/net/ethernet/davicom/dm9000.c", "dm9000_timeout"], ["drivers/net/ethernet/dec/tulip/de2104x.c", "de_tx_timeout"], ["drivers/net/ethernet/dec/tulip/tulip_core.c", "tulip_tx_timeout"], ["drivers/net/ethernet/dec/tulip/winbond-840.c", "tx_timeout"], ["drivers/net/ethernet/dlink/dl2k.c", "rio_tx_timeout"], ["drivers/net/ethernet/dlink/sundance.c", "tx_timeout"], ["drivers/net/ethernet/emulex/benet/be_main.c", "be_tx_timeout"], ["drivers/net/ethernet/ethoc.c", "ethoc_tx_timeout"], ["drivers/net/ethernet/faraday/ftgmac100.c", "ftgmac100_tx_timeout"], ["drivers/net/ethernet/fealnx.c", "fealnx_tx_timeout"], ["drivers/net/ethernet/freescale/dpaa/dpaa_eth.c", "dpaa_tx_timeout"], ["drivers/net/ethernet/freescale/fec_main.c", "fec_timeout"], ["drivers/net/ethernet/freescale/fec_mpc52xx.c", "mpc52xx_fec_tx_timeout"], ["drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c", "fs_timeout"], ["drivers/net/ethernet/freescale/gianfar.c", "gfar_timeout"], ["drivers/net/ethernet/freescale/ucc_geth.c", "ucc_geth_timeout"], ["drivers/net/ethernet/fujitsu/fmvj18x_cs.c", "fjn_tx_timeout"], ["drivers/net/ethernet/google/gve/gve_main.c", "gve_tx_timeout"], ["drivers/net/ethernet/hisilicon/hip04_eth.c", "hip04_timeout"], ["drivers/net/ethernet/hisilicon/hix5hd2_gmac.c", "hix5hd2_net_timeout"], ["drivers/net/ethernet/hisilicon/hns/hns_enet.c", "hns_nic_net_timeout"], ["drivers/net/ethernet/hisilicon/hns3/hns3_enet.c", "hns3_nic_net_timeout"], ["drivers/net/ethernet/huawei/hinic/hinic_main.c", "hinic_tx_timeout"], ["drivers/net/ethernet/i825xx/82596.c", "i596_tx_timeout"], ["drivers/net/ethernet/i825xx/ether1.c", "ether1_timeout"], ["drivers/net/ethernet/i825xx/lib82596.c", "i596_tx_timeout"], ["drivers/net/ethernet/i825xx/sun3_82586.c", "sun3_82586_timeout"], ["drivers/net/ethernet/ibm/ehea/ehea_main.c", "ehea_tx_watchdog"], ["drivers/net/ethernet/ibm/emac/core.c", "emac_tx_timeout"], ["drivers/net/ethernet/ibm/emac/core.c", "emac_tx_timeout"], ["drivers/net/ethernet/ibm/ibmvnic.c", "ibmvnic_tx_timeout"], ["drivers/net/ethernet/intel/e100.c", "e100_tx_timeout"], ["drivers/net/ethernet/intel/e1000/e1000_main.c", "e1000_tx_timeout"], ["drivers/net/ethernet/intel/e1000e/netdev.c", "e1000_tx_timeout"], ["drivers/net/ethernet/intel/fm10k/fm10k_netdev.c", "fm10k_tx_timeout"], ["drivers/net/ethernet/intel/i40e/i40e_main.c", "i40e_tx_timeout"], ["drivers/net/ethernet/intel/iavf/iavf_main.c", "iavf_tx_timeout"], ["drivers/net/ethernet/intel/ice/ice_main.c", "ice_tx_timeout"], ["drivers/net/ethernet/intel/ice/ice_main.c", "ice_tx_timeout"], ["drivers/net/ethernet/intel/igb/igb_main.c", "igb_tx_timeout"], ["drivers/net/ethernet/intel/igbvf/netdev.c", "igbvf_tx_timeout"], ["drivers/net/ethernet/intel/ixgb/ixgb_main.c", "ixgb_tx_timeout"], ["drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c", "adapter->netdev->netdev_ops->ndo_tx_timeout(adapter->netdev);"], ["drivers/net/ethernet/intel/ixgbe/ixgbe_main.c", "ixgbe_tx_timeout"], ["drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c", "ixgbevf_tx_timeout"], ["drivers/net/ethernet/jme.c", "jme_tx_timeout"], ["drivers/net/ethernet/korina.c", "korina_tx_timeout"], ["drivers/net/ethernet/lantiq_etop.c", "ltq_etop_tx_timeout"], ["drivers/net/ethernet/marvell/mv643xx_eth.c", "mv643xx_eth_tx_timeout"], ["drivers/net/ethernet/marvell/pxa168_eth.c", "pxa168_eth_tx_timeout"], ["drivers/net/ethernet/marvell/skge.c", "skge_tx_timeout"], ["drivers/net/ethernet/marvell/sky2.c", "sky2_tx_timeout"], ["drivers/net/ethernet/marvell/sky2.c", "sky2_tx_timeout"], ["drivers/net/ethernet/mediatek/mtk_eth_soc.c", "mtk_tx_timeout"], ["drivers/net/ethernet/mellanox/mlx4/en_netdev.c", "mlx4_en_tx_timeout"], ["drivers/net/ethernet/mellanox/mlx4/en_netdev.c", "mlx4_en_tx_timeout"], ["drivers/net/ethernet/mellanox/mlx5/core/en_main.c", "mlx5e_tx_timeout"], ["drivers/net/ethernet/micrel/ks8842.c", "ks8842_tx_timeout"], ["drivers/net/ethernet/micrel/ksz884x.c", "netdev_tx_timeout"], ["drivers/net/ethernet/microchip/enc28j60.c", "enc28j60_tx_timeout"], ["drivers/net/ethernet/microchip/encx24j600.c", "encx24j600_tx_timeout"], ["drivers/net/ethernet/natsemi/sonic.h", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/sonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/jazzsonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/macsonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/natsemi.c", "ns_tx_timeout"], ["drivers/net/ethernet/natsemi/ns83820.c", "ns83820_tx_timeout"], ["drivers/net/ethernet/natsemi/xtsonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/neterion/s2io.h", "s2io_tx_watchdog"], ["drivers/net/ethernet/neterion/s2io.c", "s2io_tx_watchdog"], ["drivers/net/ethernet/neterion/vxge/vxge-main.c", "vxge_tx_watchdog"], ["drivers/net/ethernet/netronome/nfp/nfp_net_common.c", "nfp_net_tx_timeout"], ["drivers/net/ethernet/nvidia/forcedeth.c", "nv_tx_timeout"], ["drivers/net/ethernet/nvidia/forcedeth.c", "nv_tx_timeout"], ["drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c", "pch_gbe_tx_timeout"], ["drivers/net/ethernet/packetengines/hamachi.c", "hamachi_tx_timeout"], ["drivers/net/ethernet/packetengines/yellowfin.c", "yellowfin_tx_timeout"], ["drivers/net/ethernet/pensando/ionic/ionic_lif.c", "ionic_tx_timeout"], ["drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c", "netxen_tx_timeout"], ["drivers/net/ethernet/qlogic/qla3xxx.c", "ql3xxx_tx_timeout"], ["drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c", "qlcnic_tx_timeout"], ["drivers/net/ethernet/qualcomm/emac/emac.c", "emac_tx_timeout"], ["drivers/net/ethernet/qualcomm/qca_spi.c", "qcaspi_netdev_tx_timeout"], ["drivers/net/ethernet/qualcomm/qca_uart.c", "qcauart_netdev_tx_timeout"], ["drivers/net/ethernet/rdc/r6040.c", "r6040_tx_timeout"], ["drivers/net/ethernet/realtek/8139cp.c", "cp_tx_timeout"], ["drivers/net/ethernet/realtek/8139too.c", "rtl8139_tx_timeout"], ["drivers/net/ethernet/realtek/atp.c", "tx_timeout"], ["drivers/net/ethernet/realtek/r8169_main.c", "rtl8169_tx_timeout"], ["drivers/net/ethernet/renesas/ravb_main.c", "ravb_tx_timeout"], ["drivers/net/ethernet/renesas/sh_eth.c", "sh_eth_tx_timeout"], ["drivers/net/ethernet/renesas/sh_eth.c", "sh_eth_tx_timeout"], ["drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c", "sxgbe_tx_timeout"], ["drivers/net/ethernet/seeq/ether3.c", "ether3_timeout"], ["drivers/net/ethernet/seeq/sgiseeq.c", "timeout"], ["drivers/net/ethernet/sfc/efx.c", "efx_watchdog"], ["drivers/net/ethernet/sfc/falcon/efx.c", "ef4_watchdog"], ["drivers/net/ethernet/sgi/ioc3-eth.c", "ioc3_timeout"], ["drivers/net/ethernet/sgi/meth.c", "meth_tx_timeout"], ["drivers/net/ethernet/silan/sc92031.c", "sc92031_tx_timeout"], ["drivers/net/ethernet/sis/sis190.c", "sis190_tx_timeout"], ["drivers/net/ethernet/sis/sis900.c", "sis900_tx_timeout"], ["drivers/net/ethernet/smsc/epic100.c", "epic_tx_timeout"], ["drivers/net/ethernet/smsc/smc911x.c", "smc911x_timeout"], ["drivers/net/ethernet/smsc/smc9194.c", "smc_timeout"], ["drivers/net/ethernet/smsc/smc91c92_cs.c", "smc_tx_timeout"], ["drivers/net/ethernet/smsc/smc91x.c", "smc_timeout"], ["drivers/net/ethernet/stmicro/stmmac/stmmac_main.c", "stmmac_tx_timeout"], ["drivers/net/ethernet/sun/cassini.c", "cas_tx_timeout"], ["drivers/net/ethernet/sun/ldmvsw.c", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/sun/niu.c", "niu_tx_timeout"], ["drivers/net/ethernet/sun/sunbmac.c", "bigmac_tx_timeout"], ["drivers/net/ethernet/sun/sungem.c", "gem_tx_timeout"], ["drivers/net/ethernet/sun/sunhme.c", "happy_meal_tx_timeout"], ["drivers/net/ethernet/sun/sunqe.c", "qe_tx_timeout"], ["drivers/net/ethernet/sun/sunvnet.c", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/sun/sunvnet_common.c", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/sun/sunvnet_common.h", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/synopsys/dwc-xlgmac-net.c", "xlgmac_tx_timeout"], ["drivers/net/ethernet/ti/cpmac.c", "cpmac_tx_timeout"], ["drivers/net/ethernet/ti/cpsw.c", "cpsw_ndo_tx_timeout"], ["drivers/net/ethernet/ti/cpsw_priv.c", "cpsw_ndo_tx_timeout"], ["drivers/net/ethernet/ti/cpsw_priv.h", "cpsw_ndo_tx_timeout"], ["drivers/net/ethernet/ti/davinci_emac.c", "emac_dev_tx_timeout"], ["drivers/net/ethernet/ti/netcp_core.c", "netcp_ndo_tx_timeout"], ["drivers/net/ethernet/ti/tlan.c", "tlan_tx_timeout"], ["drivers/net/ethernet/toshiba/ps3_gelic_net.h", "gelic_net_tx_timeout"], ["drivers/net/ethernet/toshiba/ps3_gelic_net.c", "gelic_net_tx_timeout"], ["drivers/net/ethernet/toshiba/ps3_gelic_wireless.c", "gelic_net_tx_timeout"], ["drivers/net/ethernet/toshiba/spider_net.c", "spider_net_tx_timeout"], ["drivers/net/ethernet/toshiba/tc35815.c", "tc35815_tx_timeout"], ["drivers/net/ethernet/via/via-rhine.c", "rhine_tx_timeout"], ["drivers/net/ethernet/wiznet/w5100.c", "w5100_tx_timeout"], ["drivers/net/ethernet/wiznet/w5300.c", "w5300_tx_timeout"], ["drivers/net/ethernet/xilinx/xilinx_emaclite.c", "xemaclite_tx_timeout"], ["drivers/net/ethernet/xircom/xirc2ps_cs.c", "xirc_tx_timeout"], ["drivers/net/fjes/fjes_main.c", "fjes_tx_retry"], ["drivers/net/slip/slip.c", "sl_tx_timeout"], ["include/linux/usb/usbnet.h", "usbnet_tx_timeout"], ["drivers/net/usb/aqc111.c", "usbnet_tx_timeout"], ["drivers/net/usb/asix_devices.c", "usbnet_tx_timeout"], ["drivers/net/usb/asix_devices.c", "usbnet_tx_timeout"], ["drivers/net/usb/asix_devices.c", "usbnet_tx_timeout"], ["drivers/net/usb/ax88172a.c", "usbnet_tx_timeout"], ["drivers/net/usb/ax88179_178a.c", "usbnet_tx_timeout"], ["drivers/net/usb/catc.c", "catc_tx_timeout"], ["drivers/net/usb/cdc_mbim.c", "usbnet_tx_timeout"], ["drivers/net/usb/cdc_ncm.c", "usbnet_tx_timeout"], ["drivers/net/usb/dm9601.c", "usbnet_tx_timeout"], ["drivers/net/usb/hso.c", "hso_net_tx_timeout"], ["drivers/net/usb/int51x1.c", "usbnet_tx_timeout"], ["drivers/net/usb/ipheth.c", "ipheth_tx_timeout"], ["drivers/net/usb/kaweth.c", "kaweth_tx_timeout"], ["drivers/net/usb/lan78xx.c", "lan78xx_tx_timeout"], ["drivers/net/usb/mcs7830.c", "usbnet_tx_timeout"], ["drivers/net/usb/pegasus.c", "pegasus_tx_timeout"], ["drivers/net/usb/qmi_wwan.c", "usbnet_tx_timeout"], ["drivers/net/usb/r8152.c", "rtl8152_tx_timeout"], ["drivers/net/usb/rndis_host.c", "usbnet_tx_timeout"], ["drivers/net/usb/rtl8150.c", "rtl8150_tx_timeout"], ["drivers/net/usb/sierra_net.c", "usbnet_tx_timeout"], ["drivers/net/usb/smsc75xx.c", "usbnet_tx_timeout"], ["drivers/net/usb/smsc95xx.c", "usbnet_tx_timeout"], ["drivers/net/usb/sr9700.c", "usbnet_tx_timeout"], ["drivers/net/usb/sr9800.c", "usbnet_tx_timeout"], ["drivers/net/usb/usbnet.c", "usbnet_tx_timeout"], ["drivers/net/vmxnet3/vmxnet3_drv.c", "vmxnet3_tx_timeout"], ["drivers/net/wan/cosa.c", "cosa_net_timeout"], ["drivers/net/wan/farsync.c", "fst_tx_timeout"], ["drivers/net/wan/fsl_ucc_hdlc.c", "uhdlc_tx_timeout"], ["drivers/net/wan/lmc/lmc_main.c", "lmc_driver_timeout"], ["drivers/net/wan/x25_asy.c", "x25_asy_timeout"], ["drivers/net/wimax/i2400m/netdev.c", "i2400m_tx_timeout"], ["drivers/net/wireless/intel/ipw2x00/ipw2100.c", "ipw2100_tx_timeout"], ["drivers/net/wireless/intersil/hostap/hostap_main.c", "prism2_tx_timeout"], ["drivers/net/wireless/intersil/hostap/hostap_main.c", "prism2_tx_timeout"], ["drivers/net/wireless/intersil/hostap/hostap_main.c", "prism2_tx_timeout"], ["drivers/net/wireless/intersil/orinoco/main.c", "orinoco_tx_timeout"], ["drivers/net/wireless/intersil/orinoco/orinoco_usb.c", "orinoco_tx_timeout"], ["drivers/net/wireless/intersil/orinoco/orinoco.h", "orinoco_tx_timeout"], ["drivers/net/wireless/intersil/prism54/islpci_dev.c", "islpci_eth_tx_timeout"], ["drivers/net/wireless/intersil/prism54/islpci_eth.c", "islpci_eth_tx_timeout"], ["drivers/net/wireless/intersil/prism54/islpci_eth.h", "islpci_eth_tx_timeout"], ["drivers/net/wireless/marvell/mwifiex/main.c", "mwifiex_tx_timeout"], ["drivers/net/wireless/quantenna/qtnfmac/core.c", "qtnf_netdev_tx_timeout"], ["drivers/net/wireless/quantenna/qtnfmac/core.h", "qtnf_netdev_tx_timeout"], ["drivers/net/wireless/rndis_wlan.c", "usbnet_tx_timeout"], ["drivers/net/wireless/wl3501_cs.c", "wl3501_tx_timeout"], ["drivers/net/wireless/zydas/zd1201.c", "zd1201_tx_timeout"], ["drivers/s390/net/qeth_core.h", "qeth_tx_timeout"], ["drivers/s390/net/qeth_core_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l2_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l2_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l3_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l3_main.c", "qeth_tx_timeout"], ["drivers/staging/ks7010/ks_wlan_net.c", "ks_wlan_tx_timeout"], ["drivers/staging/qlge/qlge_main.c", "qlge_tx_timeout"], ["drivers/staging/rtl8192e/rtl8192e/rtl_core.c", "_rtl92e_tx_timeout"], ["drivers/staging/rtl8192u/r8192U_core.c", "tx_timeout"], ["drivers/staging/unisys/visornic/visornic_main.c", "visornic_xmit_timeout"], ["drivers/staging/wlan-ng/p80211netdev.c", "p80211knetdev_tx_timeout"], ["drivers/tty/n_gsm.c", "gsm_mux_net_tx_timeout"], ["drivers/tty/synclink.c", "hdlcdev_tx_timeout"], ["drivers/tty/synclink_gt.c", "hdlcdev_tx_timeout"], ["drivers/tty/synclinkmp.c", "hdlcdev_tx_timeout"], ["net/atm/lec.c", "lec_tx_timeout"], ["net/bluetooth/bnep/netdev.c", "bnep_net_timeout"] ); for my $p (@work) { my @pair = @$p; my $file = $pair[0]; my $func = $pair[1]; print STDERR $file , ": ", $func,"\n"; our @ARGV = ($file); while () { if (m/($func\s*\(struct\s+net_device\s+\*[A-Za-z_]?[A-Za-z-0-9_]*)(\))/) { print STDERR "found $1+$2 in $file\n"; } if (s/($func\s*\(struct\s+net_device\s+\*[A-Za-z_]?[A-Za-z-0-9_]*)(\))/$1, unsigned int txqueue$2/) { print STDERR "$func found in $file\n"; } print; } } where the list of files and functions is simply from: git grep ndo_tx_timeout, with manual addition of headers in the rare cases where the function is from a header, then manually changing the few places which actually call ndo_tx_timeout. Signed-off-by: Michael S. Tsirkin Acked-by: Heiner Kallweit Acked-by: Jakub Kicinski Acked-by: Shannon Nelson Reviewed-by: Martin Habets changes from v9: fixup a forward declaration changes from v9: more leftovers from v3 change changes from v8: fix up a missing direct call to timeout rebased on net-next changes from v7: fixup leftovers from v3 change changes from v6: fix typo in rtl driver changes from v5: add missing files (allow any net device argument name) changes from v4: add a missing driver header changes from v3: change queue # to unsigned Changes from v2: added headers Changes from v1: Fix errors found by kbuild: generalize the pattern a bit, to pick up a couple of instances missed by the previous version. Signed-off-by: David S. Miller --- arch/m68k/emu/nfeth.c | 2 +- arch/um/drivers/net_kern.c | 2 +- arch/um/drivers/vector_kern.c | 2 +- arch/xtensa/platforms/iss/network.c | 2 +- drivers/char/pcmcia/synclink_cs.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- drivers/message/fusion/mptlan.c | 2 +- drivers/misc/sgi-xp/xpnet.c | 2 +- drivers/net/appletalk/cops.c | 4 ++-- drivers/net/arcnet/arcdevice.h | 2 +- drivers/net/arcnet/arcnet.c | 2 +- drivers/net/ethernet/3com/3c509.c | 4 ++-- drivers/net/ethernet/3com/3c515.c | 4 ++-- drivers/net/ethernet/3com/3c574_cs.c | 4 ++-- drivers/net/ethernet/3com/3c589_cs.c | 4 ++-- drivers/net/ethernet/3com/3c59x.c | 4 ++-- drivers/net/ethernet/3com/typhoon.c | 2 +- drivers/net/ethernet/8390/8390.c | 4 ++-- drivers/net/ethernet/8390/8390.h | 4 ++-- drivers/net/ethernet/8390/8390p.c | 4 ++-- drivers/net/ethernet/8390/axnet_cs.c | 4 ++-- drivers/net/ethernet/8390/lib8390.c | 2 +- drivers/net/ethernet/adaptec/starfire.c | 4 ++-- drivers/net/ethernet/agere/et131x.c | 2 +- drivers/net/ethernet/allwinner/sun4i-emac.c | 2 +- drivers/net/ethernet/alteon/acenic.c | 4 ++-- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/amd/7990.c | 2 +- drivers/net/ethernet/amd/7990.h | 2 +- drivers/net/ethernet/amd/a2065.c | 2 +- drivers/net/ethernet/amd/am79c961a.c | 2 +- drivers/net/ethernet/amd/amd8111e.c | 2 +- drivers/net/ethernet/amd/ariadne.c | 2 +- drivers/net/ethernet/amd/atarilance.c | 4 ++-- drivers/net/ethernet/amd/au1000_eth.c | 2 +- drivers/net/ethernet/amd/declance.c | 2 +- drivers/net/ethernet/amd/lance.c | 4 ++-- drivers/net/ethernet/amd/ni65.c | 4 ++-- drivers/net/ethernet/amd/nmclan_cs.c | 4 ++-- drivers/net/ethernet/amd/pcnet32.c | 4 ++-- drivers/net/ethernet/amd/sunlance.c | 2 +- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +- drivers/net/ethernet/apm/xgene-v2/main.c | 2 +- drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 2 +- drivers/net/ethernet/apple/macmace.c | 4 ++-- drivers/net/ethernet/atheros/ag71xx.c | 2 +- drivers/net/ethernet/atheros/alx/main.c | 2 +- drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 2 +- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 2 +- drivers/net/ethernet/atheros/atlx/atl2.c | 2 +- drivers/net/ethernet/atheros/atlx/atlx.c | 2 +- drivers/net/ethernet/broadcom/b44.c | 2 +- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- drivers/net/ethernet/broadcom/bnx2.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 2 +- drivers/net/ethernet/broadcom/sb1250-mac.c | 4 ++-- drivers/net/ethernet/broadcom/tg3.c | 2 +- drivers/net/ethernet/calxeda/xgmac.c | 2 +- drivers/net/ethernet/cavium/liquidio/lio_main.c | 2 +- drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 2 +- drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c | 4 ++-- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 2 +- drivers/net/ethernet/cirrus/cs89x0.c | 2 +- drivers/net/ethernet/cisco/enic/enic_main.c | 2 +- drivers/net/ethernet/cortina/gemini.c | 2 +- drivers/net/ethernet/davicom/dm9000.c | 2 +- drivers/net/ethernet/dec/tulip/de2104x.c | 2 +- drivers/net/ethernet/dec/tulip/tulip_core.c | 4 ++-- drivers/net/ethernet/dec/tulip/winbond-840.c | 4 ++-- drivers/net/ethernet/dlink/dl2k.c | 4 ++-- drivers/net/ethernet/dlink/sundance.c | 4 ++-- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- drivers/net/ethernet/ethoc.c | 2 +- drivers/net/ethernet/faraday/ftgmac100.c | 2 +- drivers/net/ethernet/fealnx.c | 4 ++-- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- drivers/net/ethernet/freescale/fec_main.c | 2 +- drivers/net/ethernet/freescale/fec_mpc52xx.c | 2 +- drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 2 +- drivers/net/ethernet/freescale/ucc_geth.c | 2 +- drivers/net/ethernet/fujitsu/fmvj18x_cs.c | 4 ++-- drivers/net/ethernet/google/gve/gve_main.c | 2 +- drivers/net/ethernet/hisilicon/hip04_eth.c | 2 +- drivers/net/ethernet/hisilicon/hix5hd2_gmac.c | 2 +- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- drivers/net/ethernet/huawei/hinic/hinic_main.c | 2 +- drivers/net/ethernet/i825xx/82596.c | 4 ++-- drivers/net/ethernet/i825xx/ether1.c | 4 ++-- drivers/net/ethernet/i825xx/lib82596.c | 4 ++-- drivers/net/ethernet/i825xx/sun3_82586.c | 4 ++-- drivers/net/ethernet/ibm/ehea/ehea_main.c | 2 +- drivers/net/ethernet/ibm/emac/core.c | 2 +- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- drivers/net/ethernet/intel/e100.c | 2 +- drivers/net/ethernet/intel/e1000/e1000_main.c | 4 ++-- drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- drivers/net/ethernet/intel/igbvf/netdev.c | 2 +- drivers/net/ethernet/intel/ixgb/ixgb_main.c | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c | 4 +++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/ethernet/jme.c | 2 +- drivers/net/ethernet/korina.c | 2 +- drivers/net/ethernet/lantiq_etop.c | 2 +- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- drivers/net/ethernet/marvell/pxa168_eth.c | 2 +- drivers/net/ethernet/marvell/skge.c | 2 +- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/micrel/ks8842.c | 2 +- drivers/net/ethernet/micrel/ksz884x.c | 2 +- drivers/net/ethernet/microchip/enc28j60.c | 2 +- drivers/net/ethernet/microchip/encx24j600.c | 2 +- drivers/net/ethernet/natsemi/natsemi.c | 4 ++-- drivers/net/ethernet/natsemi/ns83820.c | 4 ++-- drivers/net/ethernet/natsemi/sonic.c | 2 +- drivers/net/ethernet/natsemi/sonic.h | 2 +- drivers/net/ethernet/neterion/s2io.c | 2 +- drivers/net/ethernet/neterion/s2io.h | 2 +- drivers/net/ethernet/neterion/vxge/vxge-main.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- drivers/net/ethernet/nvidia/forcedeth.c | 2 +- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 2 +- drivers/net/ethernet/packetengines/hamachi.c | 4 ++-- drivers/net/ethernet/packetengines/yellowfin.c | 4 ++-- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 2 +- drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 4 ++-- drivers/net/ethernet/qlogic/qla3xxx.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 4 ++-- drivers/net/ethernet/qualcomm/emac/emac.c | 2 +- drivers/net/ethernet/qualcomm/qca_spi.c | 2 +- drivers/net/ethernet/qualcomm/qca_uart.c | 2 +- drivers/net/ethernet/rdc/r6040.c | 2 +- drivers/net/ethernet/realtek/8139cp.c | 2 +- drivers/net/ethernet/realtek/8139too.c | 4 ++-- drivers/net/ethernet/realtek/atp.c | 4 ++-- drivers/net/ethernet/realtek/r8169_main.c | 2 +- drivers/net/ethernet/renesas/ravb_main.c | 2 +- drivers/net/ethernet/renesas/sh_eth.c | 2 +- drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 2 +- drivers/net/ethernet/seeq/ether3.c | 4 ++-- drivers/net/ethernet/seeq/sgiseeq.c | 2 +- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/sfc/falcon/efx.c | 2 +- drivers/net/ethernet/sgi/ioc3-eth.c | 4 ++-- drivers/net/ethernet/sgi/meth.c | 4 ++-- drivers/net/ethernet/silan/sc92031.c | 2 +- drivers/net/ethernet/sis/sis190.c | 2 +- drivers/net/ethernet/sis/sis900.c | 4 ++-- drivers/net/ethernet/smsc/epic100.c | 4 ++-- drivers/net/ethernet/smsc/smc911x.c | 2 +- drivers/net/ethernet/smsc/smc9194.c | 4 ++-- drivers/net/ethernet/smsc/smc91c92_cs.c | 4 ++-- drivers/net/ethernet/smsc/smc91x.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/ethernet/sun/cassini.c | 2 +- drivers/net/ethernet/sun/niu.c | 2 +- drivers/net/ethernet/sun/sunbmac.c | 2 +- drivers/net/ethernet/sun/sungem.c | 2 +- drivers/net/ethernet/sun/sunhme.c | 2 +- drivers/net/ethernet/sun/sunqe.c | 2 +- drivers/net/ethernet/sun/sunvnet_common.c | 2 +- drivers/net/ethernet/sun/sunvnet_common.h | 2 +- drivers/net/ethernet/synopsys/dwc-xlgmac-net.c | 2 +- drivers/net/ethernet/ti/cpmac.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.h | 2 +- drivers/net/ethernet/ti/davinci_emac.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- drivers/net/ethernet/ti/tlan.c | 6 +++--- drivers/net/ethernet/toshiba/ps3_gelic_net.c | 2 +- drivers/net/ethernet/toshiba/ps3_gelic_net.h | 2 +- drivers/net/ethernet/toshiba/spider_net.c | 2 +- drivers/net/ethernet/toshiba/tc35815.c | 4 ++-- drivers/net/ethernet/via/via-rhine.c | 4 ++-- drivers/net/ethernet/wiznet/w5100.c | 2 +- drivers/net/ethernet/wiznet/w5300.c | 2 +- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 2 +- drivers/net/ethernet/xircom/xirc2ps_cs.c | 4 ++-- drivers/net/fjes/fjes_main.c | 4 ++-- drivers/net/slip/slip.c | 2 +- drivers/net/usb/catc.c | 2 +- drivers/net/usb/hso.c | 2 +- drivers/net/usb/ipheth.c | 2 +- drivers/net/usb/kaweth.c | 2 +- drivers/net/usb/lan78xx.c | 2 +- drivers/net/usb/pegasus.c | 2 +- drivers/net/usb/r8152.c | 2 +- drivers/net/usb/rtl8150.c | 2 +- drivers/net/usb/usbnet.c | 2 +- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- drivers/net/wan/cosa.c | 4 ++-- drivers/net/wan/farsync.c | 2 +- drivers/net/wan/fsl_ucc_hdlc.c | 2 +- drivers/net/wan/lmc/lmc_main.c | 4 ++-- drivers/net/wan/x25_asy.c | 2 +- drivers/net/wimax/i2400m/netdev.c | 2 +- drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_main.c | 2 +- drivers/net/wireless/intersil/orinoco/main.c | 2 +- drivers/net/wireless/intersil/orinoco/orinoco.h | 2 +- drivers/net/wireless/intersil/prism54/islpci_eth.c | 2 +- drivers/net/wireless/intersil/prism54/islpci_eth.h | 2 +- drivers/net/wireless/marvell/mwifiex/main.c | 2 +- drivers/net/wireless/quantenna/qtnfmac/core.c | 2 +- drivers/net/wireless/wl3501_cs.c | 2 +- drivers/net/wireless/zydas/zd1201.c | 2 +- drivers/s390/net/qeth_core.h | 2 +- drivers/s390/net/qeth_core_main.c | 2 +- drivers/staging/ks7010/ks_wlan_net.c | 4 ++-- drivers/staging/qlge/qlge_main.c | 2 +- drivers/staging/rtl8192e/rtl8192e/rtl_core.c | 2 +- drivers/staging/rtl8192u/r8192U_core.c | 2 +- drivers/staging/unisys/visornic/visornic_main.c | 2 +- drivers/staging/wlan-ng/p80211netdev.c | 4 ++-- drivers/tty/n_gsm.c | 2 +- drivers/tty/synclink.c | 2 +- drivers/tty/synclink_gt.c | 2 +- drivers/tty/synclinkmp.c | 2 +- include/linux/netdevice.h | 5 +++-- include/linux/usb/usbnet.h | 2 +- net/atm/lec.c | 2 +- net/bluetooth/bnep/netdev.c | 2 +- net/sched/sch_generic.c | 2 +- 236 files changed, 298 insertions(+), 295 deletions(-) (limited to 'net') diff --git a/arch/m68k/emu/nfeth.c b/arch/m68k/emu/nfeth.c index a4ebd2445eda..d2875e32abfc 100644 --- a/arch/m68k/emu/nfeth.c +++ b/arch/m68k/emu/nfeth.c @@ -167,7 +167,7 @@ static int nfeth_xmit(struct sk_buff *skb, struct net_device *dev) return 0; } -static void nfeth_tx_timeout(struct net_device *dev) +static void nfeth_tx_timeout(struct net_device *dev, unsigned int txqueue) { dev->stats.tx_errors++; netif_wake_queue(dev); diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 327b728f7244..35ebeebfc1a8 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -247,7 +247,7 @@ static void uml_net_set_multicast_list(struct net_device *dev) return; } -static void uml_net_tx_timeout(struct net_device *dev) +static void uml_net_tx_timeout(struct net_device *dev, unsigned int txqueue) { netif_trans_update(dev); netif_wake_queue(dev); diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 92617e16829e..0ff86391f77d 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1332,7 +1332,7 @@ static void vector_net_set_multicast_list(struct net_device *dev) return; } -static void vector_net_tx_timeout(struct net_device *dev) +static void vector_net_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct vector_private *vp = netdev_priv(dev); diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index fa9f3893b002..4986226a5ab2 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -455,7 +455,7 @@ static void iss_net_set_multicast_list(struct net_device *dev) { } -static void iss_net_tx_timeout(struct net_device *dev) +static void iss_net_tx_timeout(struct net_device *dev, unsigned int txqueue) { } diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 82f9a6a814ae..e342daa73d1b 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -4169,7 +4169,7 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * * dev pointer to network device structure */ -static void hdlcdev_tx_timeout(struct net_device *dev) +static void hdlcdev_tx_timeout(struct net_device *dev, unsigned int txqueue) { MGSLPC_INFO *info = dev_to_port(dev); unsigned long flags; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index e5f438ab716c..4a0d3a9e72e1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1182,7 +1182,7 @@ unref: return NETDEV_TX_OK; } -static void ipoib_timeout(struct net_device *dev) +static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) { struct ipoib_dev_priv *priv = ipoib_priv(dev); diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c index ebc00d47abf5..7d3784aa20e5 100644 --- a/drivers/message/fusion/mptlan.c +++ b/drivers/message/fusion/mptlan.c @@ -552,7 +552,7 @@ mpt_lan_close(struct net_device *dev) /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ /* Tx timeout handler. */ static void -mpt_lan_tx_timeout(struct net_device *dev) +mpt_lan_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mpt_lan_priv *priv = netdev_priv(dev); MPT_ADAPTER *mpt_dev = priv->mpt_dev; diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c index f7d610a22347..ada94e6a3c91 100644 --- a/drivers/misc/sgi-xp/xpnet.c +++ b/drivers/misc/sgi-xp/xpnet.c @@ -496,7 +496,7 @@ xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) * Deal with transmit timeouts coming from the network layer. */ static void -xpnet_dev_tx_timeout(struct net_device *dev) +xpnet_dev_tx_timeout(struct net_device *dev, unsigned int txqueue) { dev->stats.tx_errors++; } diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c index b3c63d2f16aa..18428e104445 100644 --- a/drivers/net/appletalk/cops.c +++ b/drivers/net/appletalk/cops.c @@ -189,7 +189,7 @@ static int cops_nodeid (struct net_device *dev, int nodeid); static irqreturn_t cops_interrupt (int irq, void *dev_id); static void cops_poll(struct timer_list *t); -static void cops_timeout(struct net_device *dev); +static void cops_timeout(struct net_device *dev, unsigned int txqueue); static void cops_rx (struct net_device *dev); static netdev_tx_t cops_send_packet (struct sk_buff *skb, struct net_device *dev); @@ -844,7 +844,7 @@ static void cops_rx(struct net_device *dev) netif_rx(skb); } -static void cops_timeout(struct net_device *dev) +static void cops_timeout(struct net_device *dev, unsigned int txqueue) { struct cops_local *lp = netdev_priv(dev); int ioaddr = dev->base_addr; diff --git a/drivers/net/arcnet/arcdevice.h b/drivers/net/arcnet/arcdevice.h index b0f5bc07aef5..22a49c6d7ae6 100644 --- a/drivers/net/arcnet/arcdevice.h +++ b/drivers/net/arcnet/arcdevice.h @@ -356,7 +356,7 @@ int arcnet_open(struct net_device *dev); int arcnet_close(struct net_device *dev); netdev_tx_t arcnet_send_packet(struct sk_buff *skb, struct net_device *dev); -void arcnet_timeout(struct net_device *dev); +void arcnet_timeout(struct net_device *dev, unsigned int txqueue); /* I/O equivalents */ diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 553776cc1d29..e04efc0a5c97 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -763,7 +763,7 @@ static int go_tx(struct net_device *dev) } /* Called by the kernel when transmit times out */ -void arcnet_timeout(struct net_device *dev) +void arcnet_timeout(struct net_device *dev, unsigned int txqueue) { unsigned long flags; struct arcnet_local *lp = netdev_priv(dev); diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index 3da97996bdf3..8cafd06ff0c4 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -196,7 +196,7 @@ static struct net_device_stats *el3_get_stats(struct net_device *dev); static int el3_rx(struct net_device *dev); static int el3_close(struct net_device *dev); static void set_multicast_list(struct net_device *dev); -static void el3_tx_timeout (struct net_device *dev); +static void el3_tx_timeout (struct net_device *dev, unsigned int txqueue); static void el3_down(struct net_device *dev); static void el3_up(struct net_device *dev); static const struct ethtool_ops ethtool_ops; @@ -689,7 +689,7 @@ el3_open(struct net_device *dev) } static void -el3_tx_timeout (struct net_device *dev) +el3_tx_timeout (struct net_device *dev, unsigned int txqueue) { int ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index b15752267c8d..1e233e2f0a5a 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -371,7 +371,7 @@ static void corkscrew_timer(struct timer_list *t); static netdev_tx_t corkscrew_start_xmit(struct sk_buff *skb, struct net_device *dev); static int corkscrew_rx(struct net_device *dev); -static void corkscrew_timeout(struct net_device *dev); +static void corkscrew_timeout(struct net_device *dev, unsigned int txqueue); static int boomerang_rx(struct net_device *dev); static irqreturn_t corkscrew_interrupt(int irq, void *dev_id); static int corkscrew_close(struct net_device *dev); @@ -961,7 +961,7 @@ static void corkscrew_timer(struct timer_list *t) #endif /* AUTOMEDIA */ } -static void corkscrew_timeout(struct net_device *dev) +static void corkscrew_timeout(struct net_device *dev, unsigned int txqueue) { int i; struct corkscrew_private *vp = netdev_priv(dev); diff --git a/drivers/net/ethernet/3com/3c574_cs.c b/drivers/net/ethernet/3com/3c574_cs.c index 3044a6f35f04..ef1c3151fbb2 100644 --- a/drivers/net/ethernet/3com/3c574_cs.c +++ b/drivers/net/ethernet/3com/3c574_cs.c @@ -234,7 +234,7 @@ static void update_stats(struct net_device *dev); static struct net_device_stats *el3_get_stats(struct net_device *dev); static int el3_rx(struct net_device *dev, int worklimit); static int el3_close(struct net_device *dev); -static void el3_tx_timeout(struct net_device *dev); +static void el3_tx_timeout(struct net_device *dev, unsigned int txqueue); static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void set_rx_mode(struct net_device *dev); static void set_multicast_list(struct net_device *dev); @@ -690,7 +690,7 @@ static int el3_open(struct net_device *dev) return 0; } -static void el3_tx_timeout(struct net_device *dev) +static void el3_tx_timeout(struct net_device *dev, unsigned int txqueue) { unsigned int ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/3com/3c589_cs.c b/drivers/net/ethernet/3com/3c589_cs.c index 2b2695311bda..d47cde6c5f08 100644 --- a/drivers/net/ethernet/3com/3c589_cs.c +++ b/drivers/net/ethernet/3com/3c589_cs.c @@ -173,7 +173,7 @@ static void update_stats(struct net_device *dev); static struct net_device_stats *el3_get_stats(struct net_device *dev); static int el3_rx(struct net_device *dev); static int el3_close(struct net_device *dev); -static void el3_tx_timeout(struct net_device *dev); +static void el3_tx_timeout(struct net_device *dev, unsigned int txqueue); static void set_rx_mode(struct net_device *dev); static void set_multicast_list(struct net_device *dev); static const struct ethtool_ops netdev_ethtool_ops; @@ -526,7 +526,7 @@ static int el3_open(struct net_device *dev) return 0; } -static void el3_tx_timeout(struct net_device *dev) +static void el3_tx_timeout(struct net_device *dev, unsigned int txqueue) { unsigned int ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index 8785c2ff3825..fc046797c0ea 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -776,7 +776,7 @@ static void set_rx_mode(struct net_device *dev); #ifdef CONFIG_PCI static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); #endif -static void vortex_tx_timeout(struct net_device *dev); +static void vortex_tx_timeout(struct net_device *dev, unsigned int txqueue); static void acpi_set_WOL(struct net_device *dev); static const struct ethtool_ops vortex_ethtool_ops; static void set_8021q_mode(struct net_device *dev, int enable); @@ -1877,7 +1877,7 @@ leave_media_alone: iowrite16(FakeIntr, ioaddr + EL3_CMD); } -static void vortex_tx_timeout(struct net_device *dev) +static void vortex_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct vortex_private *vp = netdev_priv(dev); void __iomem *ioaddr = vp->ioaddr; diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index be823c186517..14fce6658106 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -2013,7 +2013,7 @@ typhoon_stop_runtime(struct typhoon *tp, int wait_type) } static void -typhoon_tx_timeout(struct net_device *dev) +typhoon_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct typhoon *tp = netdev_priv(dev); diff --git a/drivers/net/ethernet/8390/8390.c b/drivers/net/ethernet/8390/8390.c index 78f3e532c600..0e0aa4016858 100644 --- a/drivers/net/ethernet/8390/8390.c +++ b/drivers/net/ethernet/8390/8390.c @@ -36,9 +36,9 @@ void ei_set_multicast_list(struct net_device *dev) } EXPORT_SYMBOL(ei_set_multicast_list); -void ei_tx_timeout(struct net_device *dev) +void ei_tx_timeout(struct net_device *dev, unsigned int txqueue) { - __ei_tx_timeout(dev); + __ei_tx_timeout(dev, txqueue); } EXPORT_SYMBOL(ei_tx_timeout); diff --git a/drivers/net/ethernet/8390/8390.h b/drivers/net/ethernet/8390/8390.h index 3e2f2c2e7b58..529c728f334a 100644 --- a/drivers/net/ethernet/8390/8390.h +++ b/drivers/net/ethernet/8390/8390.h @@ -32,7 +32,7 @@ void NS8390_init(struct net_device *dev, int startp); int ei_open(struct net_device *dev); int ei_close(struct net_device *dev); irqreturn_t ei_interrupt(int irq, void *dev_id); -void ei_tx_timeout(struct net_device *dev); +void ei_tx_timeout(struct net_device *dev, unsigned int txqueue); netdev_tx_t ei_start_xmit(struct sk_buff *skb, struct net_device *dev); void ei_set_multicast_list(struct net_device *dev); struct net_device_stats *ei_get_stats(struct net_device *dev); @@ -50,7 +50,7 @@ void NS8390p_init(struct net_device *dev, int startp); int eip_open(struct net_device *dev); int eip_close(struct net_device *dev); irqreturn_t eip_interrupt(int irq, void *dev_id); -void eip_tx_timeout(struct net_device *dev); +void eip_tx_timeout(struct net_device *dev, unsigned int txqueue); netdev_tx_t eip_start_xmit(struct sk_buff *skb, struct net_device *dev); void eip_set_multicast_list(struct net_device *dev); struct net_device_stats *eip_get_stats(struct net_device *dev); diff --git a/drivers/net/ethernet/8390/8390p.c b/drivers/net/ethernet/8390/8390p.c index 6cf36992a2c6..6834742057b3 100644 --- a/drivers/net/ethernet/8390/8390p.c +++ b/drivers/net/ethernet/8390/8390p.c @@ -41,9 +41,9 @@ void eip_set_multicast_list(struct net_device *dev) } EXPORT_SYMBOL(eip_set_multicast_list); -void eip_tx_timeout(struct net_device *dev) +void eip_tx_timeout(struct net_device *dev, unsigned int txqueue) { - __ei_tx_timeout(dev); + __ei_tx_timeout(dev, txqueue); } EXPORT_SYMBOL(eip_tx_timeout); diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c index 0b6bbf63f7ca..aeae7966a082 100644 --- a/drivers/net/ethernet/8390/axnet_cs.c +++ b/drivers/net/ethernet/8390/axnet_cs.c @@ -83,7 +83,7 @@ static netdev_tx_t axnet_start_xmit(struct sk_buff *skb, struct net_device *dev); static struct net_device_stats *get_stats(struct net_device *dev); static void set_multicast_list(struct net_device *dev); -static void axnet_tx_timeout(struct net_device *dev); +static void axnet_tx_timeout(struct net_device *dev, unsigned int txqueue); static irqreturn_t ei_irq_wrapper(int irq, void *dev_id); static void ei_watchdog(struct timer_list *t); static void axnet_reset_8390(struct net_device *dev); @@ -903,7 +903,7 @@ static int ax_close(struct net_device *dev) * completed (or failed) - i.e. never posted a Tx related interrupt. */ -static void axnet_tx_timeout(struct net_device *dev) +static void axnet_tx_timeout(struct net_device *dev, unsigned int txqueue) { long e8390_base = dev->base_addr; struct ei_device *ei_local = netdev_priv(dev); diff --git a/drivers/net/ethernet/8390/lib8390.c b/drivers/net/ethernet/8390/lib8390.c index c9c55c9eab9f..babc92e2692e 100644 --- a/drivers/net/ethernet/8390/lib8390.c +++ b/drivers/net/ethernet/8390/lib8390.c @@ -251,7 +251,7 @@ static int __ei_close(struct net_device *dev) * completed (or failed) - i.e. never posted a Tx related interrupt. */ -static void __ei_tx_timeout(struct net_device *dev) +static void __ei_tx_timeout(struct net_device *dev, unsigned int txqueue) { unsigned long e8390_base = dev->base_addr; struct ei_device *ei_local = netdev_priv(dev); diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 816540e6beac..165d18405b0c 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -576,7 +576,7 @@ static int mdio_read(struct net_device *dev, int phy_id, int location); static void mdio_write(struct net_device *dev, int phy_id, int location, int value); static int netdev_open(struct net_device *dev); static void check_duplex(struct net_device *dev); -static void tx_timeout(struct net_device *dev); +static void tx_timeout(struct net_device *dev, unsigned int txqueue); static void init_ring(struct net_device *dev); static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev); static irqreturn_t intr_handler(int irq, void *dev_instance); @@ -1105,7 +1105,7 @@ static void check_duplex(struct net_device *dev) } -static void tx_timeout(struct net_device *dev) +static void tx_timeout(struct net_device *dev, unsigned int txqueue) { struct netdev_private *np = netdev_priv(dev); void __iomem *ioaddr = np->base; diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c index 174344c450af..3c51d8c502ed 100644 --- a/drivers/net/ethernet/agere/et131x.c +++ b/drivers/net/ethernet/agere/et131x.c @@ -3811,7 +3811,7 @@ drop_err: * specified by the 'tx_timeo" element in the net_device structure (see * et131x_alloc_device() to see how this value is set). */ -static void et131x_tx_timeout(struct net_device *netdev) +static void et131x_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct et131x_adapter *adapter = netdev_priv(netdev); struct tx_ring *tx_ring = &adapter->tx_ring; diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index 0537df06a9b5..5ea806423e4c 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -407,7 +407,7 @@ static void emac_init_device(struct net_device *dev) } /* Our watchdog timed out. Called by the networking layer */ -static void emac_timeout(struct net_device *dev) +static void emac_timeout(struct net_device *dev, unsigned int txqueue) { struct emac_board_info *db = netdev_priv(dev); unsigned long flags; diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index 46b4207d3266..f366faf88eee 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -437,7 +437,7 @@ static const struct ethtool_ops ace_ethtool_ops = { .set_link_ksettings = ace_set_link_ksettings, }; -static void ace_watchdog(struct net_device *dev); +static void ace_watchdog(struct net_device *dev, unsigned int txqueue); static const struct net_device_ops ace_netdev_ops = { .ndo_open = ace_open, @@ -1542,7 +1542,7 @@ static void ace_set_rxtx_parms(struct net_device *dev, int jumbo) } -static void ace_watchdog(struct net_device *data) +static void ace_watchdog(struct net_device *data, unsigned int txqueue) { struct net_device *dev = data; struct ace_private *ap = netdev_priv(dev); diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index d161537e24e2..26954fde4766 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -108,7 +108,7 @@ static void ena_unmap_tx_buff(struct ena_ring *tx_ring, static int ena_create_io_tx_queues_in_range(struct ena_adapter *adapter, int first_index, int count); -static void ena_tx_timeout(struct net_device *dev) +static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct ena_adapter *adapter = netdev_priv(dev); diff --git a/drivers/net/ethernet/amd/7990.c b/drivers/net/ethernet/amd/7990.c index ab30761003da..cf3562e82ca9 100644 --- a/drivers/net/ethernet/amd/7990.c +++ b/drivers/net/ethernet/amd/7990.c @@ -527,7 +527,7 @@ int lance_close(struct net_device *dev) } EXPORT_SYMBOL_GPL(lance_close); -void lance_tx_timeout(struct net_device *dev) +void lance_tx_timeout(struct net_device *dev, unsigned int txqueue) { printk("lance_tx_timeout\n"); lance_reset(dev); diff --git a/drivers/net/ethernet/amd/7990.h b/drivers/net/ethernet/amd/7990.h index 741cdc392c6b..8266b3c1fefc 100644 --- a/drivers/net/ethernet/amd/7990.h +++ b/drivers/net/ethernet/amd/7990.h @@ -243,7 +243,7 @@ int lance_open(struct net_device *dev); int lance_close(struct net_device *dev); int lance_start_xmit(struct sk_buff *skb, struct net_device *dev); void lance_set_multicast(struct net_device *dev); -void lance_tx_timeout(struct net_device *dev); +void lance_tx_timeout(struct net_device *dev, unsigned int txqueue); #ifdef CONFIG_NET_POLL_CONTROLLER void lance_poll(struct net_device *dev); #endif diff --git a/drivers/net/ethernet/amd/a2065.c b/drivers/net/ethernet/amd/a2065.c index 212fe72a190b..a3faf4feb204 100644 --- a/drivers/net/ethernet/amd/a2065.c +++ b/drivers/net/ethernet/amd/a2065.c @@ -522,7 +522,7 @@ static inline int lance_reset(struct net_device *dev) return status; } -static void lance_tx_timeout(struct net_device *dev) +static void lance_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct lance_private *lp = netdev_priv(dev); volatile struct lance_regs *ll = lp->ll; diff --git a/drivers/net/ethernet/amd/am79c961a.c b/drivers/net/ethernet/amd/am79c961a.c index 0842da492a64..1c53408f5d47 100644 --- a/drivers/net/ethernet/amd/am79c961a.c +++ b/drivers/net/ethernet/amd/am79c961a.c @@ -422,7 +422,7 @@ static void am79c961_setmulticastlist (struct net_device *dev) spin_unlock_irqrestore(&priv->chip_lock, flags); } -static void am79c961_timeout(struct net_device *dev) +static void am79c961_timeout(struct net_device *dev, unsigned int txqueue) { printk(KERN_WARNING "%s: transmit timed out, network cable problem?\n", dev->name); diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 573e88fc8ede..0f3b743425e8 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -1569,7 +1569,7 @@ static int amd8111e_enable_link_change(struct amd8111e_priv *lp) * failed or the interface is locked up. This function will reinitialize * the hardware. */ -static void amd8111e_tx_timeout(struct net_device *dev) +static void amd8111e_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct amd8111e_priv *lp = netdev_priv(dev); int err; diff --git a/drivers/net/ethernet/amd/ariadne.c b/drivers/net/ethernet/amd/ariadne.c index 4b6a5cb85dd2..5e0f645f5bde 100644 --- a/drivers/net/ethernet/amd/ariadne.c +++ b/drivers/net/ethernet/amd/ariadne.c @@ -530,7 +530,7 @@ static inline void ariadne_reset(struct net_device *dev) netif_start_queue(dev); } -static void ariadne_tx_timeout(struct net_device *dev) +static void ariadne_tx_timeout(struct net_device *dev, unsigned int txqueue) { volatile struct Am79C960 *lance = (struct Am79C960 *)dev->base_addr; diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c index d3d44e07afbc..4e36122609a3 100644 --- a/drivers/net/ethernet/amd/atarilance.c +++ b/drivers/net/ethernet/amd/atarilance.c @@ -346,7 +346,7 @@ static int lance_rx( struct net_device *dev ); static int lance_close( struct net_device *dev ); static void set_multicast_list( struct net_device *dev ); static int lance_set_mac_address( struct net_device *dev, void *addr ); -static void lance_tx_timeout (struct net_device *dev); +static void lance_tx_timeout (struct net_device *dev, unsigned int txqueue); /************************* End of Prototypes **************************/ @@ -727,7 +727,7 @@ static void lance_init_ring( struct net_device *dev ) /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -static void lance_tx_timeout (struct net_device *dev) +static void lance_tx_timeout (struct net_device *dev, unsigned int txqueue) { struct lance_private *lp = netdev_priv(dev); struct lance_ioreg *IO = lp->iobase; diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c index 1793950f0582..d832c9f4d306 100644 --- a/drivers/net/ethernet/amd/au1000_eth.c +++ b/drivers/net/ethernet/amd/au1000_eth.c @@ -1014,7 +1014,7 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev) * The Tx ring has been full longer than the watchdog timeout * value. The transmitter must be hung? */ -static void au1000_tx_timeout(struct net_device *dev) +static void au1000_tx_timeout(struct net_device *dev, unsigned int txqueue) { netdev_err(dev, "au1000_tx_timeout: dev=%p\n", dev); au1000_reset_mac(dev); diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index dac4a2fcad6a..6592a2db9efb 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -884,7 +884,7 @@ static inline int lance_reset(struct net_device *dev) return status; } -static void lance_tx_timeout(struct net_device *dev) +static void lance_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct lance_private *lp = netdev_priv(dev); volatile struct lance_regs *ll = lp->ll; diff --git a/drivers/net/ethernet/amd/lance.c b/drivers/net/ethernet/amd/lance.c index f90b454b1642..aff44241988c 100644 --- a/drivers/net/ethernet/amd/lance.c +++ b/drivers/net/ethernet/amd/lance.c @@ -306,7 +306,7 @@ static irqreturn_t lance_interrupt(int irq, void *dev_id); static int lance_close(struct net_device *dev); static struct net_device_stats *lance_get_stats(struct net_device *dev); static void set_multicast_list(struct net_device *dev); -static void lance_tx_timeout (struct net_device *dev); +static void lance_tx_timeout (struct net_device *dev, unsigned int txqueue); @@ -913,7 +913,7 @@ lance_restart(struct net_device *dev, unsigned int csr0_bits, int must_reinit) } -static void lance_tx_timeout (struct net_device *dev) +static void lance_tx_timeout (struct net_device *dev, unsigned int txqueue) { struct lance_private *lp = (struct lance_private *) dev->ml_priv; int ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/amd/ni65.c b/drivers/net/ethernet/amd/ni65.c index c6c2a54c1121..c38edf6f03a3 100644 --- a/drivers/net/ethernet/amd/ni65.c +++ b/drivers/net/ethernet/amd/ni65.c @@ -254,7 +254,7 @@ static int ni65_lance_reinit(struct net_device *dev); static void ni65_init_lance(struct priv *p,unsigned char*,int,int); static netdev_tx_t ni65_send_packet(struct sk_buff *skb, struct net_device *dev); -static void ni65_timeout(struct net_device *dev); +static void ni65_timeout(struct net_device *dev, unsigned int txqueue); static int ni65_close(struct net_device *dev); static int ni65_alloc_buffer(struct net_device *dev); static void ni65_free_buffer(struct priv *p); @@ -1133,7 +1133,7 @@ static void ni65_recv_intr(struct net_device *dev,int csr0) * kick xmitter .. */ -static void ni65_timeout(struct net_device *dev) +static void ni65_timeout(struct net_device *dev, unsigned int txqueue) { int i; struct priv *p = dev->ml_priv; diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index 9c152d85840d..023aecf6ab30 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -407,7 +407,7 @@ static int mace_open(struct net_device *dev); static int mace_close(struct net_device *dev); static netdev_tx_t mace_start_xmit(struct sk_buff *skb, struct net_device *dev); -static void mace_tx_timeout(struct net_device *dev); +static void mace_tx_timeout(struct net_device *dev, unsigned int txqueue); static irqreturn_t mace_interrupt(int irq, void *dev_id); static struct net_device_stats *mace_get_stats(struct net_device *dev); static int mace_rx(struct net_device *dev, unsigned char RxCnt); @@ -837,7 +837,7 @@ mace_start_xmit failed, put skb back into a list." ---------------------------------------------------------------------------- */ -static void mace_tx_timeout(struct net_device *dev) +static void mace_tx_timeout(struct net_device *dev, unsigned int txqueue) { mace_private *lp = netdev_priv(dev); struct pcmcia_device *link = lp->p_dev; diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index f5ad12c10934..dc7d88227e76 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -314,7 +314,7 @@ static int pcnet32_open(struct net_device *); static int pcnet32_init_ring(struct net_device *); static netdev_tx_t pcnet32_start_xmit(struct sk_buff *, struct net_device *); -static void pcnet32_tx_timeout(struct net_device *dev); +static void pcnet32_tx_timeout(struct net_device *dev, unsigned int txqueue); static irqreturn_t pcnet32_interrupt(int, void *); static int pcnet32_close(struct net_device *); static struct net_device_stats *pcnet32_get_stats(struct net_device *); @@ -2455,7 +2455,7 @@ static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits) lp->a->write_csr(ioaddr, CSR0, csr0_bits); } -static void pcnet32_tx_timeout(struct net_device *dev) +static void pcnet32_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct pcnet32_private *lp = netdev_priv(dev); unsigned long ioaddr = dev->base_addr, flags; diff --git a/drivers/net/ethernet/amd/sunlance.c b/drivers/net/ethernet/amd/sunlance.c index ebcbf8ca4829..b00e00881253 100644 --- a/drivers/net/ethernet/amd/sunlance.c +++ b/drivers/net/ethernet/amd/sunlance.c @@ -1097,7 +1097,7 @@ static void lance_piozero(void __iomem *dest, int len) sbus_writeb(0, piobuf); } -static void lance_tx_timeout(struct net_device *dev) +static void lance_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct lance_private *lp = netdev_priv(dev); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 98f8f2033154..b71f9b04a51e 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2152,7 +2152,7 @@ static int xgbe_change_mtu(struct net_device *netdev, int mtu) return 0; } -static void xgbe_tx_timeout(struct net_device *netdev) +static void xgbe_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct xgbe_prv_data *pdata = netdev_priv(netdev); diff --git a/drivers/net/ethernet/apm/xgene-v2/main.c b/drivers/net/ethernet/apm/xgene-v2/main.c index 02b4f3af02b5..c48f60996761 100644 --- a/drivers/net/ethernet/apm/xgene-v2/main.c +++ b/drivers/net/ethernet/apm/xgene-v2/main.c @@ -575,7 +575,7 @@ static void xge_free_pending_skb(struct net_device *ndev) } } -static void xge_timeout(struct net_device *ndev) +static void xge_timeout(struct net_device *ndev, unsigned int txqueue) { struct xge_pdata *pdata = netdev_priv(ndev); diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index d8612131c55e..e284b6753725 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -859,7 +859,7 @@ static int xgene_enet_napi(struct napi_struct *napi, const int budget) return processed; } -static void xgene_enet_timeout(struct net_device *ndev) +static void xgene_enet_timeout(struct net_device *ndev, unsigned int txqueue) { struct xgene_enet_pdata *pdata = netdev_priv(ndev); struct netdev_queue *txq; diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c index 8d03578d5e8c..95d3061c61be 100644 --- a/drivers/net/ethernet/apple/macmace.c +++ b/drivers/net/ethernet/apple/macmace.c @@ -91,7 +91,7 @@ static int mace_set_address(struct net_device *dev, void *addr); static void mace_reset(struct net_device *dev); static irqreturn_t mace_interrupt(int irq, void *dev_id); static irqreturn_t mace_dma_intr(int irq, void *dev_id); -static void mace_tx_timeout(struct net_device *dev); +static void mace_tx_timeout(struct net_device *dev, unsigned int txqueue); static void __mace_set_address(struct net_device *dev, void *addr); /* @@ -600,7 +600,7 @@ static irqreturn_t mace_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void mace_tx_timeout(struct net_device *dev) +static void mace_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mace_data *mp = netdev_priv(dev); volatile struct mace *mb = mp->mace; diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index 8f5021091eee..ad8b0e3fcd2c 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -1409,7 +1409,7 @@ static void ag71xx_oom_timer_handler(struct timer_list *t) napi_schedule(&ag->napi); } -static void ag71xx_tx_timeout(struct net_device *ndev) +static void ag71xx_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct ag71xx *ag = netdev_priv(ndev); diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c index d4bbcdfd691a..1dcbc486eca9 100644 --- a/drivers/net/ethernet/atheros/alx/main.c +++ b/drivers/net/ethernet/atheros/alx/main.c @@ -1553,7 +1553,7 @@ static netdev_tx_t alx_start_xmit(struct sk_buff *skb, return alx_start_xmit_ring(skb, alx_tx_queue_mapping(alx, skb)); } -static void alx_tx_timeout(struct net_device *dev) +static void alx_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct alx_priv *alx = netdev_priv(dev); diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index 2b239ecea05f..4c0b1f8551dd 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -350,7 +350,7 @@ static void atl1c_del_timer(struct atl1c_adapter *adapter) * atl1c_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure */ -static void atl1c_tx_timeout(struct net_device *netdev) +static void atl1c_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct atl1c_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 4f7b65825c15..e0d89942d537 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -251,7 +251,7 @@ static void atl1e_cancel_work(struct atl1e_adapter *adapter) * atl1e_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure */ -static void atl1e_tx_timeout(struct net_device *netdev) +static void atl1e_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct atl1e_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c index 3aba38322717..b81a4e0c5b57 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.c +++ b/drivers/net/ethernet/atheros/atlx/atl2.c @@ -1001,7 +1001,7 @@ static int atl2_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) * atl2_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure */ -static void atl2_tx_timeout(struct net_device *netdev) +static void atl2_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct atl2_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/atheros/atlx/atlx.c b/drivers/net/ethernet/atheros/atlx/atlx.c index 505a22c703f7..0941d07d0833 100644 --- a/drivers/net/ethernet/atheros/atlx/atlx.c +++ b/drivers/net/ethernet/atheros/atlx/atlx.c @@ -183,7 +183,7 @@ static void atlx_clear_phy_int(struct atlx_adapter *adapter) * atlx_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure */ -static void atlx_tx_timeout(struct net_device *netdev) +static void atlx_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct atlx_adapter *adapter = netdev_priv(netdev); /* Do the reset outside of interrupt context */ diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index 035dbb1b2c98..5b3464c3e8d1 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -948,7 +948,7 @@ irq_ack: return IRQ_RETVAL(handled); } -static void b44_tx_timeout(struct net_device *dev) +static void b44_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct b44 *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 825af709708e..8e3152779a61 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1354,7 +1354,7 @@ out: return ret; } -static void bcm_sysport_tx_timeout(struct net_device *dev) +static void bcm_sysport_tx_timeout(struct net_device *dev, unsigned int txqueue) { netdev_warn(dev, "transmit timeout!\n"); diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index fbc196b480b6..dbb7874607ca 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -6575,7 +6575,7 @@ bnx2_dump_state(struct bnx2 *bp) } static void -bnx2_tx_timeout(struct net_device *dev) +bnx2_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct bnx2 *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 5e037a305b83..ee9e9290f112 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -4970,7 +4970,7 @@ int bnx2x_set_features(struct net_device *dev, netdev_features_t features) return 0; } -void bnx2x_tx_timeout(struct net_device *dev) +void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct bnx2x *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 8b08cb18e363..e35f48bfdc85 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -617,7 +617,7 @@ int bnx2x_set_features(struct net_device *dev, netdev_features_t features); * * @dev: net device */ -void bnx2x_tx_timeout(struct net_device *dev); +void bnx2x_tx_timeout(struct net_device *dev, unsigned int txqueue); /** bnx2x_get_c2s_mapping - read inner-to-outer vlan configuration * c2s_map should have BNX2X_MAX_PRIORITY entries. diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 85983f0e3134..4e34841906c7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9976,7 +9976,7 @@ static void bnxt_reset_task(struct bnxt *bp, bool silent) } } -static void bnxt_tx_timeout(struct net_device *dev) +static void bnxt_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct bnxt *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 120fa05a39ff..32f1245a69e2 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3055,7 +3055,7 @@ static void bcmgenet_dump_tx_queue(struct bcmgenet_tx_ring *ring) ring->cb_ptr, ring->end_ptr); } -static void bcmgenet_timeout(struct net_device *dev) +static void bcmgenet_timeout(struct net_device *dev, unsigned int txqueue) { struct bcmgenet_priv *priv = netdev_priv(dev); u32 int0_enable = 0; diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c index 1604ad32e920..80ff52527233 100644 --- a/drivers/net/ethernet/broadcom/sb1250-mac.c +++ b/drivers/net/ethernet/broadcom/sb1250-mac.c @@ -294,7 +294,7 @@ static int sbmac_set_duplex(struct sbmac_softc *s, enum sbmac_duplex duplex, enum sbmac_fc fc); static int sbmac_open(struct net_device *dev); -static void sbmac_tx_timeout (struct net_device *dev); +static void sbmac_tx_timeout (struct net_device *dev, unsigned int txqueue); static void sbmac_set_rx_mode(struct net_device *dev); static int sbmac_mii_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static int sbmac_close(struct net_device *dev); @@ -2419,7 +2419,7 @@ static void sbmac_mii_poll(struct net_device *dev) } -static void sbmac_tx_timeout (struct net_device *dev) +static void sbmac_tx_timeout (struct net_device *dev, unsigned int txqueue) { struct sbmac_softc *sc = netdev_priv(dev); unsigned long flags; diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index ca3aa1250dd1..460b4992914a 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7645,7 +7645,7 @@ static void tg3_poll_controller(struct net_device *dev) } #endif -static void tg3_tx_timeout(struct net_device *dev) +static void tg3_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct tg3 *tp = netdev_priv(dev); diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c index af04a2c81adb..05a3d067c3fc 100644 --- a/drivers/net/ethernet/calxeda/xgmac.c +++ b/drivers/net/ethernet/calxeda/xgmac.c @@ -1251,7 +1251,7 @@ static int xgmac_poll(struct napi_struct *napi, int budget) * netdev structure and arrange for the device to be reset to a sane state * in order to transmit a new packet. */ -static void xgmac_tx_timeout(struct net_device *dev) +static void xgmac_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct xgmac_priv *priv = netdev_priv(dev); schedule_work(&priv->tx_timeout_work); diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 7f3b2e3b0868..eab05b5534ea 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -2562,7 +2562,7 @@ lio_xmit_failed: /** \brief Network device Tx timeout * @param netdev pointer to network device */ -static void liquidio_tx_timeout(struct net_device *netdev) +static void liquidio_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct lio *lio; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index 370d76822ee0..7a77544a54f5 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -1628,7 +1628,7 @@ lio_xmit_failed: /** \brief Network device Tx timeout * @param netdev pointer to network device */ -static void liquidio_tx_timeout(struct net_device *netdev) +static void liquidio_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct lio *lio; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c index f3f2e71431ac..600de587d7a9 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c @@ -31,7 +31,7 @@ static int lio_vf_rep_open(struct net_device *ndev); static int lio_vf_rep_stop(struct net_device *ndev); static netdev_tx_t lio_vf_rep_pkt_xmit(struct sk_buff *skb, struct net_device *ndev); -static void lio_vf_rep_tx_timeout(struct net_device *netdev); +static void lio_vf_rep_tx_timeout(struct net_device *netdev, unsigned int txqueue); static int lio_vf_rep_phys_port_name(struct net_device *dev, char *buf, size_t len); static void lio_vf_rep_get_stats64(struct net_device *dev, @@ -172,7 +172,7 @@ lio_vf_rep_stop(struct net_device *ndev) } static void -lio_vf_rep_tx_timeout(struct net_device *ndev) +lio_vf_rep_tx_timeout(struct net_device *ndev, unsigned int txqueue) { netif_trans_update(ndev); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index f28409279ea4..016957285f99 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1741,7 +1741,7 @@ static void nicvf_get_stats64(struct net_device *netdev, } -static void nicvf_tx_timeout(struct net_device *dev) +static void nicvf_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct nicvf *nic = netdev_priv(dev); diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index c9aebcde403a..33ace3307059 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -1128,7 +1128,7 @@ net_get_stats(struct net_device *dev) return &dev->stats; } -static void net_timeout(struct net_device *dev) +static void net_timeout(struct net_device *dev, unsigned int txqueue) { /* If we get here, some higher level has decided we are broken. There should really be a "kick me" function call instead. */ diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index acb2856936d2..bbd7b3175f09 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1095,7 +1095,7 @@ static void enic_set_rx_mode(struct net_device *netdev) } /* netif_tx_lock held, BHs disabled */ -static void enic_tx_timeout(struct net_device *netdev) +static void enic_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct enic *enic = netdev_priv(netdev); schedule_work(&enic->tx_hang_reset); diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index a8f4c69252ff..de0b6e066eef 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1296,7 +1296,7 @@ out_drop: return NETDEV_TX_OK; } -static void gmac_tx_timeout(struct net_device *netdev) +static void gmac_tx_timeout(struct net_device *netdev, unsigned int txqueue) { netdev_err(netdev, "Tx timeout\n"); gmac_dump_dma_state(netdev); diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index cce90b5925d9..1ea3372775e6 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -964,7 +964,7 @@ dm9000_init_dm9000(struct net_device *dev) } /* Our watchdog timed out. Called by the networking layer */ -static void dm9000_timeout(struct net_device *dev) +static void dm9000_timeout(struct net_device *dev, unsigned int txqueue) { struct board_info *db = netdev_priv(dev); u8 reg_save; diff --git a/drivers/net/ethernet/dec/tulip/de2104x.c b/drivers/net/ethernet/dec/tulip/de2104x.c index f1a2da15dd0a..fd3c2abf74b5 100644 --- a/drivers/net/ethernet/dec/tulip/de2104x.c +++ b/drivers/net/ethernet/dec/tulip/de2104x.c @@ -1436,7 +1436,7 @@ static int de_close (struct net_device *dev) return 0; } -static void de_tx_timeout (struct net_device *dev) +static void de_tx_timeout (struct net_device *dev, unsigned int txqueue) { struct de_private *de = netdev_priv(dev); const int irq = de->pdev->irq; diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 3e3e08698876..9e9d9eee29d9 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -255,7 +255,7 @@ MODULE_DEVICE_TABLE(pci, tulip_pci_tbl); const char tulip_media_cap[32] = {0,0,0,16, 3,19,16,24, 27,4,7,5, 0,20,23,20, 28,31,0,0, }; -static void tulip_tx_timeout(struct net_device *dev); +static void tulip_tx_timeout(struct net_device *dev, unsigned int txqueue); static void tulip_init_ring(struct net_device *dev); static void tulip_free_ring(struct net_device *dev); static netdev_tx_t tulip_start_xmit(struct sk_buff *skb, @@ -534,7 +534,7 @@ free_ring: } -static void tulip_tx_timeout(struct net_device *dev) +static void tulip_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct tulip_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->base_addr; diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c index 70cb2d689c2c..7f136488e67c 100644 --- a/drivers/net/ethernet/dec/tulip/winbond-840.c +++ b/drivers/net/ethernet/dec/tulip/winbond-840.c @@ -331,7 +331,7 @@ static void netdev_timer(struct timer_list *t); static void init_rxtx_rings(struct net_device *dev); static void free_rxtx_rings(struct netdev_private *np); static void init_registers(struct net_device *dev); -static void tx_timeout(struct net_device *dev); +static void tx_timeout(struct net_device *dev, unsigned int txqueue); static int alloc_ringdesc(struct net_device *dev); static void free_ringdesc(struct netdev_private *np); static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev); @@ -921,7 +921,7 @@ static void init_registers(struct net_device *dev) iowrite32(0, ioaddr + RxStartDemand); } -static void tx_timeout(struct net_device *dev) +static void tx_timeout(struct net_device *dev, unsigned int txqueue) { struct netdev_private *np = netdev_priv(dev); void __iomem *ioaddr = np->base_addr; diff --git a/drivers/net/ethernet/dlink/dl2k.c b/drivers/net/ethernet/dlink/dl2k.c index 55e720d2ea0c..26c5da032b1e 100644 --- a/drivers/net/ethernet/dlink/dl2k.c +++ b/drivers/net/ethernet/dlink/dl2k.c @@ -66,7 +66,7 @@ static const int multicast_filter_limit = 0x40; static int rio_open (struct net_device *dev); static void rio_timer (struct timer_list *t); -static void rio_tx_timeout (struct net_device *dev); +static void rio_tx_timeout (struct net_device *dev, unsigned int txqueue); static netdev_tx_t start_xmit (struct sk_buff *skb, struct net_device *dev); static irqreturn_t rio_interrupt (int irq, void *dev_instance); static void rio_free_tx (struct net_device *dev, int irq); @@ -696,7 +696,7 @@ rio_timer (struct timer_list *t) } static void -rio_tx_timeout (struct net_device *dev) +rio_tx_timeout (struct net_device *dev, unsigned int txqueue) { struct netdev_private *np = netdev_priv(dev); void __iomem *ioaddr = np->ioaddr; diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c index 4a37a69764ce..b91387c456ba 100644 --- a/drivers/net/ethernet/dlink/sundance.c +++ b/drivers/net/ethernet/dlink/sundance.c @@ -432,7 +432,7 @@ static int mdio_wait_link(struct net_device *dev, int wait); static int netdev_open(struct net_device *dev); static void check_duplex(struct net_device *dev); static void netdev_timer(struct timer_list *t); -static void tx_timeout(struct net_device *dev); +static void tx_timeout(struct net_device *dev, unsigned int txqueue); static void init_ring(struct net_device *dev); static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev); static int reset_tx (struct net_device *dev); @@ -969,7 +969,7 @@ static void netdev_timer(struct timer_list *t) add_timer(&np->timer); } -static void tx_timeout(struct net_device *dev) +static void tx_timeout(struct net_device *dev, unsigned int txqueue) { struct netdev_private *np = netdev_priv(dev); void __iomem *ioaddr = np->base; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 39eb7d525043..56f59db6ebf2 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1417,7 +1417,7 @@ drop: return NETDEV_TX_OK; } -static void be_tx_timeout(struct net_device *netdev) +static void be_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct be_adapter *adapter = netdev_priv(netdev); struct device *dev = &adapter->pdev->dev; diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index ea4f17f5cce7..66406da16b60 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -869,7 +869,7 @@ static int ethoc_change_mtu(struct net_device *dev, int new_mtu) return -ENOSYS; } -static void ethoc_tx_timeout(struct net_device *dev) +static void ethoc_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct ethoc *priv = netdev_priv(dev); u32 pending = ethoc_read(priv, INT_SOURCE); diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 8ed85037f021..48b3b72fe02e 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1545,7 +1545,7 @@ static int ftgmac100_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int return phy_mii_ioctl(netdev->phydev, ifr, cmd); } -static void ftgmac100_tx_timeout(struct net_device *netdev) +static void ftgmac100_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ftgmac100 *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c index c24fd56a2c71..84f10970299a 100644 --- a/drivers/net/ethernet/fealnx.c +++ b/drivers/net/ethernet/fealnx.c @@ -428,7 +428,7 @@ static void getlinktype(struct net_device *dev); static void getlinkstatus(struct net_device *dev); static void netdev_timer(struct timer_list *t); static void reset_timer(struct timer_list *t); -static void fealnx_tx_timeout(struct net_device *dev); +static void fealnx_tx_timeout(struct net_device *dev, unsigned int txqueue); static void init_ring(struct net_device *dev); static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev); static irqreturn_t intr_handler(int irq, void *dev_instance); @@ -1191,7 +1191,7 @@ static void reset_timer(struct timer_list *t) } -static void fealnx_tx_timeout(struct net_device *dev) +static void fealnx_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct netdev_private *np = netdev_priv(dev); void __iomem *ioaddr = np->mem; diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 6a9d12dad5d9..a60fc3cfc06e 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -288,7 +288,7 @@ static int dpaa_stop(struct net_device *net_dev) return err; } -static void dpaa_tx_timeout(struct net_device *net_dev) +static void dpaa_tx_timeout(struct net_device *net_dev, unsigned int txqueue) { struct dpaa_percpu_priv *percpu_priv; const struct dpaa_priv *priv; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 05c1899f6628..798fed37be46 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1141,7 +1141,7 @@ fec_stop(struct net_device *ndev) static void -fec_timeout(struct net_device *ndev) +fec_timeout(struct net_device *ndev, unsigned int txqueue) { struct fec_enet_private *fep = netdev_priv(ndev); diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index 30cdb246d020..de5278485062 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -84,7 +84,7 @@ static int debug = -1; /* the above default */ module_param(debug, int, 0); MODULE_PARM_DESC(debug, "debugging messages level"); -static void mpc52xx_fec_tx_timeout(struct net_device *dev) +static void mpc52xx_fec_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mpc52xx_fec_priv *priv = netdev_priv(dev); unsigned long flags; diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 3981c06f082f..80903cd58468 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -641,7 +641,7 @@ static void fs_timeout_work(struct work_struct *work) netif_wake_queue(dev); } -static void fs_timeout(struct net_device *dev) +static void fs_timeout(struct net_device *dev, unsigned int txqueue) { struct fs_enet_private *fep = netdev_priv(dev); diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 72868a28b621..b636d83a7ee9 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2093,7 +2093,7 @@ static void gfar_reset_task(struct work_struct *work) reset_gfar(priv->ndev); } -static void gfar_timeout(struct net_device *dev) +static void gfar_timeout(struct net_device *dev, unsigned int txqueue) { struct gfar_private *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index f839fa94ebdd..0d101c00286f 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3545,7 +3545,7 @@ static void ucc_geth_timeout_work(struct work_struct *work) * ucc_geth_timeout gets called when a packet has not been * transmitted after a set amount of time. */ -static void ucc_geth_timeout(struct net_device *dev) +static void ucc_geth_timeout(struct net_device *dev, unsigned int txqueue) { struct ucc_geth_private *ugeth = netdev_priv(dev); diff --git a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c index 1eca0fdb9933..a7b7a4aace79 100644 --- a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c +++ b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c @@ -93,7 +93,7 @@ static irqreturn_t fjn_interrupt(int irq, void *dev_id); static void fjn_rx(struct net_device *dev); static void fjn_reset(struct net_device *dev); static void set_rx_mode(struct net_device *dev); -static void fjn_tx_timeout(struct net_device *dev); +static void fjn_tx_timeout(struct net_device *dev, unsigned int txqueue); static const struct ethtool_ops netdev_ethtool_ops; /* @@ -774,7 +774,7 @@ static irqreturn_t fjn_interrupt(int dummy, void *dev_id) /*====================================================================*/ -static void fjn_tx_timeout(struct net_device *dev) +static void fjn_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct local_info *lp = netdev_priv(dev); unsigned int ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 9b7a8db9860f..e032563ceefd 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -845,7 +845,7 @@ static void gve_turnup(struct gve_priv *priv) gve_set_napi_enabled(priv); } -static void gve_tx_timeout(struct net_device *dev) +static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct gve_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index 3e9b6d543c77..dc8dd5fc1559 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -779,7 +779,7 @@ static int hip04_mac_stop(struct net_device *ndev) return 0; } -static void hip04_timeout(struct net_device *ndev) +static void hip04_timeout(struct net_device *ndev, unsigned int txqueue) { struct hip04_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c index 247de9105d10..4fb776920a93 100644 --- a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c +++ b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c @@ -893,7 +893,7 @@ static void hix5hd2_tx_timeout_task(struct work_struct *work) hix5hd2_net_open(priv->netdev); } -static void hix5hd2_net_timeout(struct net_device *dev) +static void hix5hd2_net_timeout(struct net_device *dev, unsigned int txqueue) { struct hix5hd2_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index 14ab20491fd0..e45553ec114a 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -1485,7 +1485,7 @@ static int hns_nic_net_stop(struct net_device *ndev) static void hns_tx_timeout_reset(struct hns_nic_priv *priv); #define HNS_TX_TIMEO_LIMIT (40 * HZ) -static void hns_nic_net_timeout(struct net_device *ndev) +static void hns_nic_net_timeout(struct net_device *ndev, unsigned int txqueue) { struct hns_nic_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 69545dd6c938..b113b895dab7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1869,7 +1869,7 @@ static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) return true; } -static void hns3_nic_net_timeout(struct net_device *ndev) +static void hns3_nic_net_timeout(struct net_device *ndev, unsigned int txqueue) { struct hns3_nic_priv *priv = netdev_priv(ndev); struct hnae3_handle *h = priv->ae_handle; diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 2411ad270c98..02a14f5e7fe3 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -766,7 +766,7 @@ static void hinic_set_rx_mode(struct net_device *netdev) queue_work(nic_dev->workq, &rx_mode_work->work); } -static void hinic_tx_timeout(struct net_device *netdev) +static void hinic_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct hinic_dev *nic_dev = netdev_priv(netdev); diff --git a/drivers/net/ethernet/i825xx/82596.c b/drivers/net/ethernet/i825xx/82596.c index 92929750f832..bef676d93339 100644 --- a/drivers/net/ethernet/i825xx/82596.c +++ b/drivers/net/ethernet/i825xx/82596.c @@ -363,7 +363,7 @@ static netdev_tx_t i596_start_xmit(struct sk_buff *skb, struct net_device *dev); static irqreturn_t i596_interrupt(int irq, void *dev_id); static int i596_close(struct net_device *dev); static void i596_add_cmd(struct net_device *dev, struct i596_cmd *cmd); -static void i596_tx_timeout (struct net_device *dev); +static void i596_tx_timeout (struct net_device *dev, unsigned int txqueue); static void print_eth(unsigned char *buf, char *str); static void set_multicast_list(struct net_device *dev); @@ -1019,7 +1019,7 @@ err_irq_dev: return res; } -static void i596_tx_timeout (struct net_device *dev) +static void i596_tx_timeout (struct net_device *dev, unsigned int txqueue) { struct i596_private *lp = dev->ml_priv; int ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/i825xx/ether1.c b/drivers/net/ethernet/i825xx/ether1.c index bb3b8adbe4f0..a0bfb509e002 100644 --- a/drivers/net/ethernet/i825xx/ether1.c +++ b/drivers/net/ethernet/i825xx/ether1.c @@ -66,7 +66,7 @@ static netdev_tx_t ether1_sendpacket(struct sk_buff *skb, static irqreturn_t ether1_interrupt(int irq, void *dev_id); static int ether1_close(struct net_device *dev); static void ether1_setmulticastlist(struct net_device *dev); -static void ether1_timeout(struct net_device *dev); +static void ether1_timeout(struct net_device *dev, unsigned int txqueue); /* ------------------------------------------------------------------------- */ @@ -650,7 +650,7 @@ ether1_open (struct net_device *dev) } static void -ether1_timeout(struct net_device *dev) +ether1_timeout(struct net_device *dev, unsigned int txqueue) { printk(KERN_WARNING "%s: transmit timeout, network cable problem?\n", dev->name); diff --git a/drivers/net/ethernet/i825xx/lib82596.c b/drivers/net/ethernet/i825xx/lib82596.c index f9742af7f142..b03757e169e4 100644 --- a/drivers/net/ethernet/i825xx/lib82596.c +++ b/drivers/net/ethernet/i825xx/lib82596.c @@ -351,7 +351,7 @@ static netdev_tx_t i596_start_xmit(struct sk_buff *skb, struct net_device *dev); static irqreturn_t i596_interrupt(int irq, void *dev_id); static int i596_close(struct net_device *dev); static void i596_add_cmd(struct net_device *dev, struct i596_cmd *cmd); -static void i596_tx_timeout (struct net_device *dev); +static void i596_tx_timeout (struct net_device *dev, unsigned int txqueue); static void print_eth(unsigned char *buf, char *str); static void set_multicast_list(struct net_device *dev); static inline void ca(struct net_device *dev); @@ -936,7 +936,7 @@ out_remove_rx_bufs: return -EAGAIN; } -static void i596_tx_timeout (struct net_device *dev) +static void i596_tx_timeout (struct net_device *dev, unsigned int txqueue) { struct i596_private *lp = netdev_priv(dev); diff --git a/drivers/net/ethernet/i825xx/sun3_82586.c b/drivers/net/ethernet/i825xx/sun3_82586.c index 1a86184d44c0..4564ee02c95f 100644 --- a/drivers/net/ethernet/i825xx/sun3_82586.c +++ b/drivers/net/ethernet/i825xx/sun3_82586.c @@ -125,7 +125,7 @@ static netdev_tx_t sun3_82586_send_packet(struct sk_buff *, struct net_device *); static struct net_device_stats *sun3_82586_get_stats(struct net_device *dev); static void set_multicast_list(struct net_device *dev); -static void sun3_82586_timeout(struct net_device *dev); +static void sun3_82586_timeout(struct net_device *dev, unsigned int txqueue); #if 0 static void sun3_82586_dump(struct net_device *,void *); #endif @@ -965,7 +965,7 @@ static void startrecv586(struct net_device *dev) WAIT_4_SCB_CMD_RUC(); /* wait for accept cmd. (no timeout!!) */ } -static void sun3_82586_timeout(struct net_device *dev) +static void sun3_82586_timeout(struct net_device *dev, unsigned int txqueue) { struct priv *p = netdev_priv(dev); #ifndef NO_NOPCOMMANDS diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index 13e30eba5349..0273fb7a9d01 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -2786,7 +2786,7 @@ out: return; } -static void ehea_tx_watchdog(struct net_device *dev) +static void ehea_tx_watchdog(struct net_device *dev, unsigned int txqueue) { struct ehea_port *port = netdev_priv(dev); diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 2e40425d8a34..b7fc17756c51 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -776,7 +776,7 @@ static void emac_reset_work(struct work_struct *work) mutex_unlock(&dev->link_lock); } -static void emac_tx_timeout(struct net_device *ndev) +static void emac_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct emac_instance *dev = netdev_priv(ndev); diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index c90080781924..94b9d8913b66 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2282,7 +2282,7 @@ err: return -ret; } -static void ibmvnic_tx_timeout(struct net_device *dev) +static void ibmvnic_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct ibmvnic_adapter *adapter = netdev_priv(dev); diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index a65d5a9ba7db..1b8d015ebfb0 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -2316,7 +2316,7 @@ static void e100_down(struct nic *nic) e100_rx_clean_list(nic); } -static void e100_tx_timeout(struct net_device *netdev) +static void e100_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct nic *nic = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index aca97b084003..2bced34c19ba 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -134,7 +134,7 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); static void e1000_enter_82542_rst(struct e1000_adapter *adapter); static void e1000_leave_82542_rst(struct e1000_adapter *adapter); -static void e1000_tx_timeout(struct net_device *dev); +static void e1000_tx_timeout(struct net_device *dev, unsigned int txqueue); static void e1000_reset_task(struct work_struct *work); static void e1000_smartspeed(struct e1000_adapter *adapter); static int e1000_82547_fifo_workaround(struct e1000_adapter *adapter, @@ -3488,7 +3488,7 @@ exit: * e1000_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void e1000_tx_timeout(struct net_device *netdev) +static void e1000_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct e1000_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index fe7997c18a10..4c220600ea9a 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5929,7 +5929,7 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, * e1000_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void e1000_tx_timeout(struct net_device *netdev) +static void e1000_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct e1000_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 68baee04dc58..ba2566e2123d 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -697,7 +697,7 @@ static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev) * fm10k_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void fm10k_tx_timeout(struct net_device *netdev) +static void fm10k_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct fm10k_intfc *interface = netdev_priv(netdev); bool real_tx_hang = false; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1ccabeafa44c..4c9ac6c80eb8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -301,7 +301,7 @@ void i40e_service_event_schedule(struct i40e_pf *pf) * device is munged, not just the one netdev port, so go for the full * reset. **/ -static void i40e_tx_timeout(struct net_device *netdev) +static void i40e_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct i40e_netdev_priv *np = netdev_priv(netdev); struct i40e_vsi *vsi = np->vsi; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 821987da5698..0a8824871618 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -159,7 +159,7 @@ void iavf_schedule_reset(struct iavf_adapter *adapter) * iavf_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void iavf_tx_timeout(struct net_device *netdev) +static void iavf_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct iavf_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 69bff085acf7..4d5220c9c721 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5060,7 +5060,7 @@ ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, * ice_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure */ -static void ice_tx_timeout(struct net_device *netdev) +static void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_ring *tx_ring = NULL; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 98346eb064d5..d11e64a58ed1 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -146,7 +146,7 @@ static int igb_poll(struct napi_struct *, int); static bool igb_clean_tx_irq(struct igb_q_vector *, int); static int igb_clean_rx_irq(struct igb_q_vector *, int); static int igb_ioctl(struct net_device *, struct ifreq *, int cmd); -static void igb_tx_timeout(struct net_device *); +static void igb_tx_timeout(struct net_device *, unsigned int txqueue); static void igb_reset_task(struct work_struct *); static void igb_vlan_mode(struct net_device *netdev, netdev_features_t features); @@ -6184,7 +6184,7 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb, * igb_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void igb_tx_timeout(struct net_device *netdev) +static void igb_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct igb_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c index 6003dc3ff5fd..5b1800c3ba82 100644 --- a/drivers/net/ethernet/intel/igbvf/netdev.c +++ b/drivers/net/ethernet/intel/igbvf/netdev.c @@ -2375,7 +2375,7 @@ static netdev_tx_t igbvf_xmit_frame(struct sk_buff *skb, * igbvf_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void igbvf_tx_timeout(struct net_device *netdev) +static void igbvf_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct igbvf_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index 3d8c051dd327..b64e91ea3465 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -70,7 +70,7 @@ static int ixgb_clean(struct napi_struct *, int); static bool ixgb_clean_rx_irq(struct ixgb_adapter *, int *, int); static void ixgb_alloc_rx_buffers(struct ixgb_adapter *, int); -static void ixgb_tx_timeout(struct net_device *dev); +static void ixgb_tx_timeout(struct net_device *dev, unsigned int txqueue); static void ixgb_tx_timeout_task(struct work_struct *work); static void ixgb_vlan_strip_enable(struct ixgb_adapter *adapter); @@ -1538,7 +1538,7 @@ ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) **/ static void -ixgb_tx_timeout(struct net_device *netdev) +ixgb_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ixgb_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c index 171cdc552961..5b1cf49df3d3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c @@ -166,7 +166,9 @@ static ssize_t ixgbe_dbg_netdev_ops_write(struct file *filp, ixgbe_dbg_netdev_ops_buf[len] = '\0'; if (strncmp(ixgbe_dbg_netdev_ops_buf, "tx_timeout", 10) == 0) { - adapter->netdev->netdev_ops->ndo_tx_timeout(adapter->netdev); + /* TX Queue number below is wrong, but ixgbe does not use it */ + adapter->netdev->netdev_ops->ndo_tx_timeout(adapter->netdev, + UINT_MAX); e_dev_info("tx_timeout called\n"); } else { e_dev_info("Unknown command: %s\n", ixgbe_dbg_netdev_ops_buf); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 25c097cd8100..8129ea2e94a8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6158,7 +6158,7 @@ static void ixgbe_set_eee_capable(struct ixgbe_adapter *adapter) * ixgbe_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void ixgbe_tx_timeout(struct net_device *netdev) +static void ixgbe_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ixgbe_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 076f2da36f27..fa286694ac2c 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -250,7 +250,7 @@ static void ixgbevf_tx_timeout_reset(struct ixgbevf_adapter *adapter) * ixgbevf_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure **/ -static void ixgbevf_tx_timeout(struct net_device *netdev) +static void ixgbevf_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ixgbevf_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 25aa400e2e3c..2e4975572e9f 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -2337,7 +2337,7 @@ jme_change_mtu(struct net_device *netdev, int new_mtu) } static void -jme_tx_timeout(struct net_device *netdev) +jme_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct jme_adapter *jme = netdev_priv(netdev); diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index ae195f8adff5..f98d9d627c71 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -917,7 +917,7 @@ static void korina_restart_task(struct work_struct *work) enable_irq(lp->rx_irq); } -static void korina_tx_timeout(struct net_device *dev) +static void korina_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct korina_private *lp = netdev_priv(dev); diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 6e73ffe6f928..028e3e6222e9 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -594,7 +594,7 @@ err_hw: } static void -ltq_etop_tx_timeout(struct net_device *dev) +ltq_etop_tx_timeout(struct net_device *dev, unsigned int txqueue) { int err; diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index d5b644131cff..c43820597218 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2590,7 +2590,7 @@ static void tx_timeout_task(struct work_struct *ugly) } } -static void mv643xx_eth_tx_timeout(struct net_device *dev) +static void mv643xx_eth_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mv643xx_eth_private *mp = netdev_priv(dev); diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index 3fb7ee3d4d13..1a6877902dd6 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -742,7 +742,7 @@ txq_reclaim_end: return released; } -static void pxa168_eth_tx_timeout(struct net_device *dev) +static void pxa168_eth_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct pxa168_eth_private *pep = netdev_priv(dev); diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 095f6c71b4fa..8ca15958e752 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -2884,7 +2884,7 @@ static void skge_tx_clean(struct net_device *dev) skge->tx_ring.to_clean = e; } -static void skge_tx_timeout(struct net_device *dev) +static void skge_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct skge_port *skge = netdev_priv(dev); diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 5f56ee83e3b1..acd1cba987fb 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -2358,7 +2358,7 @@ static void sky2_qlink_intr(struct sky2_hw *hw) /* Transmit timeout is only called if we are running, carrier is up * and tx queue is full (stopped). */ -static void sky2_tx_timeout(struct net_device *dev) +static void sky2_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct sky2_port *sky2 = netdev_priv(dev); struct sky2_hw *hw = sky2->hw; diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 527ad2aadcca..8c6cfd15481c 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2081,7 +2081,7 @@ static void mtk_dma_free(struct mtk_eth *eth) kfree(eth->scratch_head); } -static void mtk_tx_timeout(struct net_device *dev) +static void mtk_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mtk_mac *mac = netdev_priv(dev); struct mtk_eth *eth = mac->hw; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 7af75b63245f..71c083960a87 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1363,7 +1363,7 @@ static void mlx4_en_delete_rss_steer_rules(struct mlx4_en_priv *priv) } } -static void mlx4_en_tx_timeout(struct net_device *dev) +static void mlx4_en_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 4980e80a5e85..68f1c8cb302b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4332,7 +4332,7 @@ unlock: rtnl_unlock(); } -static void mlx5e_tx_timeout(struct net_device *dev) +static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mlx5e_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/micrel/ks8842.c b/drivers/net/ethernet/micrel/ks8842.c index da329ca115cc..f3f6dfe3eddc 100644 --- a/drivers/net/ethernet/micrel/ks8842.c +++ b/drivers/net/ethernet/micrel/ks8842.c @@ -1103,7 +1103,7 @@ static void ks8842_tx_timeout_work(struct work_struct *work) __ks8842_start_new_rx_dma(netdev); } -static void ks8842_tx_timeout(struct net_device *netdev) +static void ks8842_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ks8842_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c index e102e1560ac7..d1444ba36e10 100644 --- a/drivers/net/ethernet/micrel/ksz884x.c +++ b/drivers/net/ethernet/micrel/ksz884x.c @@ -4896,7 +4896,7 @@ unlock: * triggered to free up resources so that the transmit routine can continue * sending out packets. The hardware is reset to correct the problem. */ -static void netdev_tx_timeout(struct net_device *dev) +static void netdev_tx_timeout(struct net_device *dev, unsigned int txqueue) { static unsigned long last_reset; diff --git a/drivers/net/ethernet/microchip/enc28j60.c b/drivers/net/ethernet/microchip/enc28j60.c index 0567e4f387a5..09cdc2f2e7ff 100644 --- a/drivers/net/ethernet/microchip/enc28j60.c +++ b/drivers/net/ethernet/microchip/enc28j60.c @@ -1325,7 +1325,7 @@ static irqreturn_t enc28j60_irq(int irq, void *dev_id) return IRQ_HANDLED; } -static void enc28j60_tx_timeout(struct net_device *ndev) +static void enc28j60_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct enc28j60_net *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/microchip/encx24j600.c b/drivers/net/ethernet/microchip/encx24j600.c index 52c41d11f565..39925e4bf2ec 100644 --- a/drivers/net/ethernet/microchip/encx24j600.c +++ b/drivers/net/ethernet/microchip/encx24j600.c @@ -892,7 +892,7 @@ static netdev_tx_t encx24j600_tx(struct sk_buff *skb, struct net_device *dev) } /* Deal with a transmit timeout */ -static void encx24j600_tx_timeout(struct net_device *dev) +static void encx24j600_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct encx24j600_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c index 1a2634cbbb69..d21d706b83a7 100644 --- a/drivers/net/ethernet/natsemi/natsemi.c +++ b/drivers/net/ethernet/natsemi/natsemi.c @@ -612,7 +612,7 @@ static void undo_cable_magic(struct net_device *dev); static void check_link(struct net_device *dev); static void netdev_timer(struct timer_list *t); static void dump_ring(struct net_device *dev); -static void ns_tx_timeout(struct net_device *dev); +static void ns_tx_timeout(struct net_device *dev, unsigned int txqueue); static int alloc_ring(struct net_device *dev); static void refill_rx(struct net_device *dev); static void init_ring(struct net_device *dev); @@ -1881,7 +1881,7 @@ static void dump_ring(struct net_device *dev) } } -static void ns_tx_timeout(struct net_device *dev) +static void ns_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct netdev_private *np = netdev_priv(dev); void __iomem * ioaddr = ns_ioaddr(dev); diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 6af9a7eee114..be5f62f06785 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -1549,7 +1549,7 @@ static int ns83820_stop(struct net_device *ndev) return 0; } -static void ns83820_tx_timeout(struct net_device *ndev) +static void ns83820_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct ns83820 *dev = PRIV(ndev); u32 tx_done_idx; @@ -1603,7 +1603,7 @@ static void ns83820_tx_watch(struct timer_list *t) ndev->name, dev->tx_done_idx, dev->tx_free_idx, atomic_read(&dev->nr_tx_skbs)); - ns83820_tx_timeout(ndev); + ns83820_tx_timeout(ndev, UINT_MAX); } mod_timer(&dev->tx_watchdog, jiffies + 2*HZ); diff --git a/drivers/net/ethernet/natsemi/sonic.c b/drivers/net/ethernet/natsemi/sonic.c index b339125b2f09..fdebc8598b22 100644 --- a/drivers/net/ethernet/natsemi/sonic.c +++ b/drivers/net/ethernet/natsemi/sonic.c @@ -161,7 +161,7 @@ static int sonic_close(struct net_device *dev) return 0; } -static void sonic_tx_timeout(struct net_device *dev) +static void sonic_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct sonic_local *lp = netdev_priv(dev); int i; diff --git a/drivers/net/ethernet/natsemi/sonic.h b/drivers/net/ethernet/natsemi/sonic.h index 2b27f7049acb..f1544481aac1 100644 --- a/drivers/net/ethernet/natsemi/sonic.h +++ b/drivers/net/ethernet/natsemi/sonic.h @@ -336,7 +336,7 @@ static int sonic_close(struct net_device *dev); static struct net_device_stats *sonic_get_stats(struct net_device *dev); static void sonic_multicast_list(struct net_device *dev); static int sonic_init(struct net_device *dev); -static void sonic_tx_timeout(struct net_device *dev); +static void sonic_tx_timeout(struct net_device *dev, unsigned int txqueue); static void sonic_msg_init(struct net_device *dev); /* Internal inlines for reading/writing DMA buffers. Note that bus diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index e0b2bf327905..0ec6b8e8b549 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -7238,7 +7238,7 @@ out_unlock: * void */ -static void s2io_tx_watchdog(struct net_device *dev) +static void s2io_tx_watchdog(struct net_device *dev, unsigned int txqueue) { struct s2io_nic *sp = netdev_priv(dev); struct swStat *swstats = &sp->mac_control.stats_info->sw_stat; diff --git a/drivers/net/ethernet/neterion/s2io.h b/drivers/net/ethernet/neterion/s2io.h index 0a921f30f98f..6fa3159a977f 100644 --- a/drivers/net/ethernet/neterion/s2io.h +++ b/drivers/net/ethernet/neterion/s2io.h @@ -1065,7 +1065,7 @@ static void s2io_txpic_intr_handle(struct s2io_nic *sp); static void tx_intr_handler(struct fifo_info *fifo_data); static void s2io_handle_errors(void * dev_id); -static void s2io_tx_watchdog(struct net_device *dev); +static void s2io_tx_watchdog(struct net_device *dev, unsigned int txqueue); static void s2io_set_multicast(struct net_device *dev); static int rx_osm_handler(struct ring_info *ring_data, struct RxD_t * rxdp); static void s2io_link(struct s2io_nic * sp, int link); diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c index 1d334f2e0a56..9b63574b6202 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c @@ -3273,7 +3273,7 @@ static int vxge_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) * This function is triggered if the Tx Queue is stopped * for a pre-defined amount of time when the Interface is still up. */ -static void vxge_tx_watchdog(struct net_device *dev) +static void vxge_tx_watchdog(struct net_device *dev, unsigned int txqueue) { struct vxgedev *vdev; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index bcdcd6de7dea..bd305fc6ed5a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1321,7 +1321,7 @@ nfp_net_tx_ring_reset(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring) netdev_tx_reset_queue(nd_q); } -static void nfp_net_tx_timeout(struct net_device *netdev) +static void nfp_net_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct nfp_net *nn = netdev_priv(netdev); int i; diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 6b54cb3b681d..2fc10a36afa4 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -2739,7 +2739,7 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit) * nv_tx_timeout: dev->tx_timeout function * Called with netif_tx_lock held. */ -static void nv_tx_timeout(struct net_device *dev) +static void nv_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct fe_priv *np = netdev_priv(dev); u8 __iomem *base = get_hwbase(dev); diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index 18e6d87c607b..73ec195fbc30 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -2271,7 +2271,7 @@ static int pch_gbe_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) * pch_gbe_tx_timeout - Respond to a Tx Hang * @netdev: Network interface device structure */ -static void pch_gbe_tx_timeout(struct net_device *netdev) +static void pch_gbe_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct pch_gbe_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c index eee883a2aa8d..70816d2e2990 100644 --- a/drivers/net/ethernet/packetengines/hamachi.c +++ b/drivers/net/ethernet/packetengines/hamachi.c @@ -548,7 +548,7 @@ static void mdio_write(struct net_device *dev, int phy_id, int location, int val static int hamachi_open(struct net_device *dev); static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void hamachi_timer(struct timer_list *t); -static void hamachi_tx_timeout(struct net_device *dev); +static void hamachi_tx_timeout(struct net_device *dev, unsigned int txqueue); static void hamachi_init_ring(struct net_device *dev); static netdev_tx_t hamachi_start_xmit(struct sk_buff *skb, struct net_device *dev); @@ -1042,7 +1042,7 @@ static void hamachi_timer(struct timer_list *t) add_timer(&hmp->timer); } -static void hamachi_tx_timeout(struct net_device *dev) +static void hamachi_tx_timeout(struct net_device *dev, unsigned int txqueue) { int i; struct hamachi_private *hmp = netdev_priv(dev); diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c index 5113ee647090..520779f05e1a 100644 --- a/drivers/net/ethernet/packetengines/yellowfin.c +++ b/drivers/net/ethernet/packetengines/yellowfin.c @@ -344,7 +344,7 @@ static void mdio_write(void __iomem *ioaddr, int phy_id, int location, int value static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static int yellowfin_open(struct net_device *dev); static void yellowfin_timer(struct timer_list *t); -static void yellowfin_tx_timeout(struct net_device *dev); +static void yellowfin_tx_timeout(struct net_device *dev, unsigned int txqueue); static int yellowfin_init_ring(struct net_device *dev); static netdev_tx_t yellowfin_start_xmit(struct sk_buff *skb, struct net_device *dev); @@ -677,7 +677,7 @@ static void yellowfin_timer(struct timer_list *t) add_timer(&yp->timer); } -static void yellowfin_tx_timeout(struct net_device *dev) +static void yellowfin_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct yellowfin_private *yp = netdev_priv(dev); void __iomem *ioaddr = yp->base; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index ef8258713369..a76108a9c7db 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1285,7 +1285,7 @@ static void ionic_tx_timeout_work(struct work_struct *ws) rtnl_unlock(); } -static void ionic_tx_timeout(struct net_device *netdev) +static void ionic_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct ionic_lif *lif = netdev_priv(netdev); diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index c692a41e4548..8067ea04d455 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -49,7 +49,7 @@ static int netxen_nic_open(struct net_device *netdev); static int netxen_nic_close(struct net_device *netdev); static netdev_tx_t netxen_nic_xmit_frame(struct sk_buff *, struct net_device *); -static void netxen_tx_timeout(struct net_device *netdev); +static void netxen_tx_timeout(struct net_device *netdev, unsigned int txqueue); static void netxen_tx_timeout_task(struct work_struct *work); static void netxen_fw_poll_work(struct work_struct *work); static void netxen_schedule_work(struct netxen_adapter *adapter, @@ -2222,7 +2222,7 @@ static void netxen_nic_handle_phy_intr(struct netxen_adapter *adapter) netxen_advert_link_change(adapter, linkup); } -static void netxen_tx_timeout(struct net_device *netdev) +static void netxen_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct netxen_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index b4b8ba00ee01..bb864765c761 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -3602,7 +3602,7 @@ static int ql3xxx_set_mac_address(struct net_device *ndev, void *p) return 0; } -static void ql3xxx_tx_timeout(struct net_device *ndev) +static void ql3xxx_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct ql3_adapter *qdev = netdev_priv(ndev); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index c07438db30ba..9dd6cb36f366 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -56,7 +56,7 @@ static int qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent); static void qlcnic_remove(struct pci_dev *pdev); static int qlcnic_open(struct net_device *netdev); static int qlcnic_close(struct net_device *netdev); -static void qlcnic_tx_timeout(struct net_device *netdev); +static void qlcnic_tx_timeout(struct net_device *netdev, unsigned int txqueue); static void qlcnic_attach_work(struct work_struct *work); static void qlcnic_fwinit_work(struct work_struct *work); @@ -3068,7 +3068,7 @@ static void qlcnic_dump_rings(struct qlcnic_adapter *adapter) } -static void qlcnic_tx_timeout(struct net_device *netdev) +static void qlcnic_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct qlcnic_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index 98f92268cbaa..522fad4cb2cd 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -282,7 +282,7 @@ static int emac_close(struct net_device *netdev) } /* Respond to a TX hang */ -static void emac_tx_timeout(struct net_device *netdev) +static void emac_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct emac_adapter *adpt = netdev_priv(netdev); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index baac016f3ec0..5a3b65a6eb4f 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -785,7 +785,7 @@ qcaspi_netdev_xmit(struct sk_buff *skb, struct net_device *dev) } static void -qcaspi_netdev_tx_timeout(struct net_device *dev) +qcaspi_netdev_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct qcaspi *qca = netdev_priv(dev); diff --git a/drivers/net/ethernet/qualcomm/qca_uart.c b/drivers/net/ethernet/qualcomm/qca_uart.c index 0981068504fa..375a844cd27c 100644 --- a/drivers/net/ethernet/qualcomm/qca_uart.c +++ b/drivers/net/ethernet/qualcomm/qca_uart.c @@ -248,7 +248,7 @@ out: return NETDEV_TX_OK; } -static void qcauart_netdev_tx_timeout(struct net_device *dev) +static void qcauart_netdev_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct qcauart *qca = netdev_priv(dev); diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c index 274e5b4bc4ac..c23cb61bbd30 100644 --- a/drivers/net/ethernet/rdc/r6040.c +++ b/drivers/net/ethernet/rdc/r6040.c @@ -410,7 +410,7 @@ static void r6040_init_mac_regs(struct net_device *dev) iowrite16(TM2TX, ioaddr + MTPR); } -static void r6040_tx_timeout(struct net_device *dev) +static void r6040_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct r6040_private *priv = netdev_priv(dev); void __iomem *ioaddr = priv->base; diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 4f910c4f67b0..60d342f82fb3 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -1235,7 +1235,7 @@ static int cp_close (struct net_device *dev) return 0; } -static void cp_tx_timeout(struct net_device *dev) +static void cp_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct cp_private *cp = netdev_priv(dev); unsigned long flags; diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index 55d01266e615..5caeb8368eab 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -642,7 +642,7 @@ static int mdio_read (struct net_device *dev, int phy_id, int location); static void mdio_write (struct net_device *dev, int phy_id, int location, int val); static void rtl8139_start_thread(struct rtl8139_private *tp); -static void rtl8139_tx_timeout (struct net_device *dev); +static void rtl8139_tx_timeout (struct net_device *dev, unsigned int txqueue); static void rtl8139_init_ring (struct net_device *dev); static netdev_tx_t rtl8139_start_xmit (struct sk_buff *skb, struct net_device *dev); @@ -1700,7 +1700,7 @@ static void rtl8139_tx_timeout_task (struct work_struct *work) spin_unlock_bh(&tp->rx_lock); } -static void rtl8139_tx_timeout (struct net_device *dev) +static void rtl8139_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct rtl8139_private *tp = netdev_priv(dev); diff --git a/drivers/net/ethernet/realtek/atp.c b/drivers/net/ethernet/realtek/atp.c index 58e0ca9093d3..9e3b35c97e63 100644 --- a/drivers/net/ethernet/realtek/atp.c +++ b/drivers/net/ethernet/realtek/atp.c @@ -204,7 +204,7 @@ static void net_rx(struct net_device *dev); static void read_block(long ioaddr, int length, unsigned char *buffer, int data_mode); static int net_close(struct net_device *dev); static void set_rx_mode(struct net_device *dev); -static void tx_timeout(struct net_device *dev); +static void tx_timeout(struct net_device *dev, unsigned int txqueue); /* A list of all installed ATP devices, for removing the driver module. */ @@ -533,7 +533,7 @@ static void write_packet(long ioaddr, int length, unsigned char *packet, int pad outb(Ctrl_HNibWrite | Ctrl_SelData | Ctrl_IRQEN, ioaddr + PAR_CONTROL); } -static void tx_timeout(struct net_device *dev) +static void tx_timeout(struct net_device *dev, unsigned int txqueue) { long ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 67a4d5d45e3a..2cfaa2270319 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5435,7 +5435,7 @@ static void rtl_reset_work(struct rtl8169_private *tp) netif_wake_queue(dev); } -static void rtl8169_tx_timeout(struct net_device *dev) +static void rtl8169_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct rtl8169_private *tp = netdev_priv(dev); diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 4b13a184bfc7..067ad25553b9 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1425,7 +1425,7 @@ out_napi_off: } /* Timeout function for Ethernet AVB */ -static void ravb_tx_timeout(struct net_device *ndev) +static void ravb_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct ravb_private *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index e19b49c4013e..cdd8ab2eb910 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2478,7 +2478,7 @@ out_napi_off: } /* Timeout function */ -static void sh_eth_tx_timeout(struct net_device *ndev) +static void sh_eth_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct sh_eth_private *mdp = netdev_priv(ndev); struct sh_eth_rxdesc *rxdesc; diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index c56fcbb37066..cd6e0de48248 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -1572,7 +1572,7 @@ static int sxgbe_poll(struct napi_struct *napi, int budget) * netdev structure and arrange for the device to be reset to a sane state * in order to transmit a new packet. */ -static void sxgbe_tx_timeout(struct net_device *dev) +static void sxgbe_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct sxgbe_priv_data *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/seeq/ether3.c b/drivers/net/ethernet/seeq/ether3.c index 632a7c85964d..128ee7cda1ed 100644 --- a/drivers/net/ethernet/seeq/ether3.c +++ b/drivers/net/ethernet/seeq/ether3.c @@ -79,7 +79,7 @@ static netdev_tx_t ether3_sendpacket(struct sk_buff *skb, static irqreturn_t ether3_interrupt (int irq, void *dev_id); static int ether3_close (struct net_device *dev); static void ether3_setmulticastlist (struct net_device *dev); -static void ether3_timeout(struct net_device *dev); +static void ether3_timeout(struct net_device *dev, unsigned int txqueue); #define BUS_16 2 #define BUS_8 1 @@ -450,7 +450,7 @@ static void ether3_setmulticastlist(struct net_device *dev) ether3_outw(priv(dev)->regs.config1 | CFG1_LOCBUFMEM, REG_CONFIG1); } -static void ether3_timeout(struct net_device *dev) +static void ether3_timeout(struct net_device *dev, unsigned int txqueue) { unsigned long flags; diff --git a/drivers/net/ethernet/seeq/sgiseeq.c b/drivers/net/ethernet/seeq/sgiseeq.c index 276c7cae7cee..8507ff242014 100644 --- a/drivers/net/ethernet/seeq/sgiseeq.c +++ b/drivers/net/ethernet/seeq/sgiseeq.c @@ -645,7 +645,7 @@ sgiseeq_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static void timeout(struct net_device *dev) +static void timeout(struct net_device *dev, unsigned int txqueue) { printk(KERN_NOTICE "%s: transmit timed out, resetting\n", dev->name); sgiseeq_reset(dev); diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 992c773620ec..f358709fab67 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2395,7 +2395,7 @@ static void efx_net_stats(struct net_device *net_dev, } /* Context: netif_tx_lock held, BHs disabled. */ -static void efx_watchdog(struct net_device *net_dev) +static void efx_watchdog(struct net_device *net_dev, unsigned int txqueue) { struct efx_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index eecc348b1c32..bee4cd9d7135 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2108,7 +2108,7 @@ static void ef4_net_stats(struct net_device *net_dev, } /* Context: netif_tx_lock held, BHs disabled. */ -static void ef4_watchdog(struct net_device *net_dev) +static void ef4_watchdog(struct net_device *net_dev, unsigned int txqueue) { struct ef4_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index d242906ae233..06637b03deed 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -114,7 +114,7 @@ struct ioc3_private { static int ioc3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void ioc3_set_multicast_list(struct net_device *dev); static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev); -static void ioc3_timeout(struct net_device *dev); +static void ioc3_timeout(struct net_device *dev, unsigned int txqueue); static inline unsigned int ioc3_hash(const unsigned char *addr); static void ioc3_start(struct ioc3_private *ip); static inline void ioc3_stop(struct ioc3_private *ip); @@ -1479,7 +1479,7 @@ drop_packet: return NETDEV_TX_OK; } -static void ioc3_timeout(struct net_device *dev) +static void ioc3_timeout(struct net_device *dev, unsigned int txqueue) { struct ioc3_private *ip = netdev_priv(dev); diff --git a/drivers/net/ethernet/sgi/meth.c b/drivers/net/ethernet/sgi/meth.c index 539bc5db989c..0c396ecd3389 100644 --- a/drivers/net/ethernet/sgi/meth.c +++ b/drivers/net/ethernet/sgi/meth.c @@ -90,7 +90,7 @@ struct meth_private { spinlock_t meth_lock; }; -static void meth_tx_timeout(struct net_device *dev); +static void meth_tx_timeout(struct net_device *dev, unsigned int txqueue); static irqreturn_t meth_interrupt(int irq, void *dev_id); /* global, initialized in ip32-setup.c */ @@ -727,7 +727,7 @@ static netdev_tx_t meth_tx(struct sk_buff *skb, struct net_device *dev) /* * Deal with a transmit timeout. */ -static void meth_tx_timeout(struct net_device *dev) +static void meth_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct meth_private *priv = netdev_priv(dev); unsigned long flags; diff --git a/drivers/net/ethernet/silan/sc92031.c b/drivers/net/ethernet/silan/sc92031.c index c7641a236eb8..cb043eb1bdc1 100644 --- a/drivers/net/ethernet/silan/sc92031.c +++ b/drivers/net/ethernet/silan/sc92031.c @@ -1078,7 +1078,7 @@ static void sc92031_set_multicast_list(struct net_device *dev) spin_unlock_bh(&priv->lock); } -static void sc92031_tx_timeout(struct net_device *dev) +static void sc92031_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct sc92031_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/sis/sis190.c b/drivers/net/ethernet/sis/sis190.c index 5b351beb78cb..5a4b6e3ab38f 100644 --- a/drivers/net/ethernet/sis/sis190.c +++ b/drivers/net/ethernet/sis/sis190.c @@ -1538,7 +1538,7 @@ err_out_0: goto out; } -static void sis190_tx_timeout(struct net_device *dev) +static void sis190_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct sis190_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c index 85eaccbbbac1..81ed7589e33c 100644 --- a/drivers/net/ethernet/sis/sis900.c +++ b/drivers/net/ethernet/sis/sis900.c @@ -222,7 +222,7 @@ static int mdio_read(struct net_device *net_dev, int phy_id, int location); static void mdio_write(struct net_device *net_dev, int phy_id, int location, int val); static void sis900_timer(struct timer_list *t); static void sis900_check_mode (struct net_device *net_dev, struct mii_phy *mii_phy); -static void sis900_tx_timeout(struct net_device *net_dev); +static void sis900_tx_timeout(struct net_device *net_dev, unsigned int txqueue); static void sis900_init_tx_ring(struct net_device *net_dev); static void sis900_init_rx_ring(struct net_device *net_dev); static netdev_tx_t sis900_start_xmit(struct sk_buff *skb, @@ -1537,7 +1537,7 @@ static void sis900_read_mode(struct net_device *net_dev, int *speed, int *duplex * disable interrupts and do some tasks */ -static void sis900_tx_timeout(struct net_device *net_dev) +static void sis900_tx_timeout(struct net_device *net_dev, unsigned int txqueue) { struct sis900_private *sis_priv = netdev_priv(net_dev); void __iomem *ioaddr = sis_priv->ioaddr; diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c index be47d864f8b9..912760e8514c 100644 --- a/drivers/net/ethernet/smsc/epic100.c +++ b/drivers/net/ethernet/smsc/epic100.c @@ -291,7 +291,7 @@ static int mdio_read(struct net_device *dev, int phy_id, int location); static void mdio_write(struct net_device *dev, int phy_id, int loc, int val); static void epic_restart(struct net_device *dev); static void epic_timer(struct timer_list *t); -static void epic_tx_timeout(struct net_device *dev); +static void epic_tx_timeout(struct net_device *dev, unsigned int txqueue); static void epic_init_ring(struct net_device *dev); static netdev_tx_t epic_start_xmit(struct sk_buff *skb, struct net_device *dev); @@ -861,7 +861,7 @@ static void epic_timer(struct timer_list *t) add_timer(&ep->timer); } -static void epic_tx_timeout(struct net_device *dev) +static void epic_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct epic_private *ep = netdev_priv(dev); void __iomem *ioaddr = ep->ioaddr; diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c index 7b65e79d6ae9..186c0bddbe5f 100644 --- a/drivers/net/ethernet/smsc/smc911x.c +++ b/drivers/net/ethernet/smsc/smc911x.c @@ -1245,7 +1245,7 @@ static void smc911x_poll_controller(struct net_device *dev) #endif /* Our watchdog timed out. Called by the networking layer */ -static void smc911x_timeout(struct net_device *dev) +static void smc911x_timeout(struct net_device *dev, unsigned int txqueue) { struct smc911x_local *lp = netdev_priv(dev); int status, mask; diff --git a/drivers/net/ethernet/smsc/smc9194.c b/drivers/net/ethernet/smsc/smc9194.c index d3bb2ba51f40..4b2330deed47 100644 --- a/drivers/net/ethernet/smsc/smc9194.c +++ b/drivers/net/ethernet/smsc/smc9194.c @@ -216,7 +216,7 @@ static int smc_open(struct net_device *dev); /* . Our watchdog timed out. Called by the networking layer */ -static void smc_timeout(struct net_device *dev); +static void smc_timeout(struct net_device *dev, unsigned int txqueue); /* . This is called by the kernel in response to 'ifconfig ethX down'. It @@ -1094,7 +1094,7 @@ static int smc_open(struct net_device *dev) .-------------------------------------------------------- */ -static void smc_timeout(struct net_device *dev) +static void smc_timeout(struct net_device *dev, unsigned int txqueue) { /* If we get here, some higher level has decided we are broken. There should really be a "kick me" function call instead. */ diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c index a55f430f6a7b..f2a50eb3c1e0 100644 --- a/drivers/net/ethernet/smsc/smc91c92_cs.c +++ b/drivers/net/ethernet/smsc/smc91c92_cs.c @@ -271,7 +271,7 @@ static void smc91c92_release(struct pcmcia_device *link); static int smc_open(struct net_device *dev); static int smc_close(struct net_device *dev); static int smc_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); -static void smc_tx_timeout(struct net_device *dev); +static void smc_tx_timeout(struct net_device *dev, unsigned int txqueue); static netdev_tx_t smc_start_xmit(struct sk_buff *skb, struct net_device *dev); static irqreturn_t smc_interrupt(int irq, void *dev_id); @@ -1178,7 +1178,7 @@ static void smc_hardware_send_packet(struct net_device * dev) /*====================================================================*/ -static void smc_tx_timeout(struct net_device *dev) +static void smc_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct smc_private *smc = netdev_priv(dev); unsigned int ioaddr = dev->base_addr; diff --git a/drivers/net/ethernet/smsc/smc91x.c b/drivers/net/ethernet/smsc/smc91x.c index 3a6761131f4c..90410f9d3b1a 100644 --- a/drivers/net/ethernet/smsc/smc91x.c +++ b/drivers/net/ethernet/smsc/smc91x.c @@ -1321,7 +1321,7 @@ static void smc_poll_controller(struct net_device *dev) #endif /* Our watchdog timed out. Called by the networking layer */ -static void smc_timeout(struct net_device *dev) +static void smc_timeout(struct net_device *dev, unsigned int txqueue) { struct smc_local *lp = netdev_priv(dev); void __iomem *ioaddr = lp->base; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index bbc65bd332a8..da80866d0371 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3792,7 +3792,7 @@ static int stmmac_napi_poll_tx(struct napi_struct *napi, int budget) * netdev structure and arrange for the device to be reset to a sane state * in order to transmit a new packet. */ -static void stmmac_tx_timeout(struct net_device *dev) +static void stmmac_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct stmmac_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index c91876f8c536..6ec9163e232c 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -2666,7 +2666,7 @@ static void cas_netpoll(struct net_device *dev) } #endif -static void cas_tx_timeout(struct net_device *dev) +static void cas_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct cas *cp = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index f5fd1f3c07cc..9a5004f674c7 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6517,7 +6517,7 @@ static void niu_reset_task(struct work_struct *work) spin_unlock_irqrestore(&np->lock, flags); } -static void niu_tx_timeout(struct net_device *dev) +static void niu_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct niu *np = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c index e9b757b03b56..c5add0b45eed 100644 --- a/drivers/net/ethernet/sun/sunbmac.c +++ b/drivers/net/ethernet/sun/sunbmac.c @@ -941,7 +941,7 @@ static int bigmac_close(struct net_device *dev) return 0; } -static void bigmac_tx_timeout(struct net_device *dev) +static void bigmac_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct bigmac *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index 3e7631160384..8358064fbd48 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -970,7 +970,7 @@ static void gem_poll_controller(struct net_device *dev) } #endif -static void gem_tx_timeout(struct net_device *dev) +static void gem_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct gem *gp = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index d007dfeba5c3..f0fe7bb2a750 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -2246,7 +2246,7 @@ static int happy_meal_close(struct net_device *dev) #define SXD(x) #endif -static void happy_meal_tx_timeout(struct net_device *dev) +static void happy_meal_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct happy_meal *hp = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c index 1468fa0a54e9..2102b95ec347 100644 --- a/drivers/net/ethernet/sun/sunqe.c +++ b/drivers/net/ethernet/sun/sunqe.c @@ -544,7 +544,7 @@ static void qe_tx_reclaim(struct sunqe *qep) qep->tx_old = elem; } -static void qe_tx_timeout(struct net_device *dev) +static void qe_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct sunqe *qep = netdev_priv(dev); int tx_full; diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c index 8b94d9ad9e2b..a601a306f9a5 100644 --- a/drivers/net/ethernet/sun/sunvnet_common.c +++ b/drivers/net/ethernet/sun/sunvnet_common.c @@ -1539,7 +1539,7 @@ out_dropped: } EXPORT_SYMBOL_GPL(sunvnet_start_xmit_common); -void sunvnet_tx_timeout_common(struct net_device *dev) +void sunvnet_tx_timeout_common(struct net_device *dev, unsigned int txqueue) { /* XXX Implement me XXX */ } diff --git a/drivers/net/ethernet/sun/sunvnet_common.h b/drivers/net/ethernet/sun/sunvnet_common.h index 2b808d2482d6..5416a3cb9e7d 100644 --- a/drivers/net/ethernet/sun/sunvnet_common.h +++ b/drivers/net/ethernet/sun/sunvnet_common.h @@ -135,7 +135,7 @@ int sunvnet_open_common(struct net_device *dev); int sunvnet_close_common(struct net_device *dev); void sunvnet_set_rx_mode_common(struct net_device *dev, struct vnet *vp); int sunvnet_set_mac_addr_common(struct net_device *dev, void *p); -void sunvnet_tx_timeout_common(struct net_device *dev); +void sunvnet_tx_timeout_common(struct net_device *dev, unsigned int txqueue); netdev_tx_t sunvnet_start_xmit_common(struct sk_buff *skb, struct net_device *dev, struct vnet_port *(*vnet_tx_port) diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c index a1f5a1e61040..07046a2370b3 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c @@ -689,7 +689,7 @@ static int xlgmac_close(struct net_device *netdev) return 0; } -static void xlgmac_tx_timeout(struct net_device *netdev) +static void xlgmac_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct xlgmac_pdata *pdata = netdev_priv(netdev); diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c index 3a655a4dc10e..5e1b8292cd3f 100644 --- a/drivers/net/ethernet/ti/cpmac.c +++ b/drivers/net/ethernet/ti/cpmac.c @@ -797,7 +797,7 @@ static irqreturn_t cpmac_irq(int irq, void *dev_id) return IRQ_HANDLED; } -static void cpmac_tx_timeout(struct net_device *dev) +static void cpmac_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct cpmac_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 707d5eb480ce..97a058ca60ac 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -272,7 +272,7 @@ void soft_reset(const char *module, void __iomem *reg) WARN(readl_relaxed(reg) & 1, "failed to soft-reset %s\n", module); } -void cpsw_ndo_tx_timeout(struct net_device *ndev) +void cpsw_ndo_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct cpsw_priv *priv = netdev_priv(ndev); struct cpsw_common *cpsw = priv->cpsw; diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h index bc726356a72c..b8d7b924ee3d 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.h +++ b/drivers/net/ethernet/ti/cpsw_priv.h @@ -449,7 +449,7 @@ int cpsw_rx_poll(struct napi_struct *napi_rx, int budget); void cpsw_rx_vlan_encap(struct sk_buff *skb); void soft_reset(const char *module, void __iomem *reg); void cpsw_set_slave_mac(struct cpsw_slave *slave, struct cpsw_priv *priv); -void cpsw_ndo_tx_timeout(struct net_device *ndev); +void cpsw_ndo_tx_timeout(struct net_device *ndev, unsigned int txqueue); int cpsw_need_resplit(struct cpsw_common *cpsw); int cpsw_ndo_ioctl(struct net_device *dev, struct ifreq *req, int cmd); int cpsw_ndo_set_tx_maxrate(struct net_device *ndev, int queue, u32 rate); diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index ae27be85e363..75d4e16c692b 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -983,7 +983,7 @@ fail_tx: * error and re-initialize the TX channel for hardware operation * */ -static void emac_dev_tx_timeout(struct net_device *ndev) +static void emac_dev_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct emac_priv *priv = netdev_priv(ndev); struct device *emac_dev = &ndev->dev; diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 1b2702f74455..432645e86495 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1811,7 +1811,7 @@ out: return (ret == 0) ? 0 : err; } -static void netcp_ndo_tx_timeout(struct net_device *ndev) +static void netcp_ndo_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct netcp_intf *netcp = netdev_priv(ndev); unsigned int descs = knav_pool_count(netcp->tx_pool); diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c index 78f0f2d59e22..ad465202980a 100644 --- a/drivers/net/ethernet/ti/tlan.c +++ b/drivers/net/ethernet/ti/tlan.c @@ -161,7 +161,7 @@ static void tlan_set_multicast_list(struct net_device *); static int tlan_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static int tlan_probe1(struct pci_dev *pdev, long ioaddr, int irq, int rev, const struct pci_device_id *ent); -static void tlan_tx_timeout(struct net_device *dev); +static void tlan_tx_timeout(struct net_device *dev, unsigned int txqueue); static void tlan_tx_timeout_work(struct work_struct *work); static int tlan_init_one(struct pci_dev *pdev, const struct pci_device_id *ent); @@ -997,7 +997,7 @@ static int tlan_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) * **************************************************************/ -static void tlan_tx_timeout(struct net_device *dev) +static void tlan_tx_timeout(struct net_device *dev, unsigned int txqueue) { TLAN_DBG(TLAN_DEBUG_GNRL, "%s: Transmit timed out.\n", dev->name); @@ -1028,7 +1028,7 @@ static void tlan_tx_timeout_work(struct work_struct *work) struct tlan_priv *priv = container_of(work, struct tlan_priv, tlan_tqueue); - tlan_tx_timeout(priv->dev); + tlan_tx_timeout(priv->dev, UINT_MAX); } diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c b/drivers/net/ethernet/toshiba/ps3_gelic_net.c index 9d9f8acb7ee3..070dd6fa9401 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c @@ -1405,7 +1405,7 @@ out: * * called, if tx hangs. Schedules a task that resets the interface */ -void gelic_net_tx_timeout(struct net_device *netdev) +void gelic_net_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct gelic_card *card; diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.h b/drivers/net/ethernet/toshiba/ps3_gelic_net.h index 051033580f0a..805903dbddcc 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.h +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.h @@ -359,7 +359,7 @@ int gelic_net_open(struct net_device *netdev); int gelic_net_stop(struct net_device *netdev); netdev_tx_t gelic_net_xmit(struct sk_buff *skb, struct net_device *netdev); void gelic_net_set_multi(struct net_device *netdev); -void gelic_net_tx_timeout(struct net_device *netdev); +void gelic_net_tx_timeout(struct net_device *netdev, unsigned int txqueue); int gelic_net_setup_netdev(struct net_device *netdev, struct gelic_card *card); /* shared ethtool ops */ diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c index 538e70810d3d..6576271642c1 100644 --- a/drivers/net/ethernet/toshiba/spider_net.c +++ b/drivers/net/ethernet/toshiba/spider_net.c @@ -2180,7 +2180,7 @@ out: * called, if tx hangs. Schedules a task that resets the interface */ static void -spider_net_tx_timeout(struct net_device *netdev) +spider_net_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct spider_net_card *card; diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c index 12466a72cefc..708de826200e 100644 --- a/drivers/net/ethernet/toshiba/tc35815.c +++ b/drivers/net/ethernet/toshiba/tc35815.c @@ -483,7 +483,7 @@ static void tc35815_txdone(struct net_device *dev); static int tc35815_close(struct net_device *dev); static struct net_device_stats *tc35815_get_stats(struct net_device *dev); static void tc35815_set_multicast_list(struct net_device *dev); -static void tc35815_tx_timeout(struct net_device *dev); +static void tc35815_tx_timeout(struct net_device *dev, unsigned int txqueue); static int tc35815_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); #ifdef CONFIG_NET_POLL_CONTROLLER static void tc35815_poll_controller(struct net_device *dev); @@ -1189,7 +1189,7 @@ static void tc35815_schedule_restart(struct net_device *dev) spin_unlock_irqrestore(&lp->lock, flags); } -static void tc35815_tx_timeout(struct net_device *dev) +static void tc35815_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct tc35815_regs __iomem *tr = (struct tc35815_regs __iomem *)dev->base_addr; diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index ed12dbd156f0..803247d51fe9 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -506,7 +506,7 @@ static void mdio_write(struct net_device *dev, int phy_id, int location, int val static int rhine_open(struct net_device *dev); static void rhine_reset_task(struct work_struct *work); static void rhine_slow_event_task(struct work_struct *work); -static void rhine_tx_timeout(struct net_device *dev); +static void rhine_tx_timeout(struct net_device *dev, unsigned int txqueue); static netdev_tx_t rhine_start_tx(struct sk_buff *skb, struct net_device *dev); static irqreturn_t rhine_interrupt(int irq, void *dev_instance); @@ -1761,7 +1761,7 @@ out_unlock: mutex_unlock(&rp->task_lock); } -static void rhine_tx_timeout(struct net_device *dev) +static void rhine_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct rhine_private *rp = netdev_priv(dev); void __iomem *ioaddr = rp->base; diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index bede1ff289c5..c0d181a7f83a 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -790,7 +790,7 @@ static void w5100_restart_work(struct work_struct *work) w5100_restart(priv->ndev); } -static void w5100_tx_timeout(struct net_device *ndev) +static void w5100_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct w5100_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/wiznet/w5300.c b/drivers/net/ethernet/wiznet/w5300.c index 6ba2747779ce..46aae30c4636 100644 --- a/drivers/net/ethernet/wiznet/w5300.c +++ b/drivers/net/ethernet/wiznet/w5300.c @@ -341,7 +341,7 @@ static void w5300_get_regs(struct net_device *ndev, } } -static void w5300_tx_timeout(struct net_device *ndev) +static void w5300_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct w5300_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 0de52e70abcc..0c26f5bcc523 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -521,7 +521,7 @@ static int xemaclite_set_mac_address(struct net_device *dev, void *address) * * This function is called when Tx time out occurs for Emaclite device. */ -static void xemaclite_tx_timeout(struct net_device *dev) +static void xemaclite_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct net_local *lp = netdev_priv(dev); unsigned long flags; diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c index fd5288ff53b5..480ab7251515 100644 --- a/drivers/net/ethernet/xircom/xirc2ps_cs.c +++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c @@ -288,7 +288,7 @@ struct local_info { */ static netdev_tx_t do_start_xmit(struct sk_buff *skb, struct net_device *dev); -static void xirc_tx_timeout(struct net_device *dev); +static void xirc_tx_timeout(struct net_device *dev, unsigned int txqueue); static void xirc2ps_tx_timeout_task(struct work_struct *work); static void set_addresses(struct net_device *dev); static void set_multicast_list(struct net_device *dev); @@ -1203,7 +1203,7 @@ xirc2ps_tx_timeout_task(struct work_struct *work) } static void -xirc_tx_timeout(struct net_device *dev) +xirc_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct local_info *lp = netdev_priv(dev); dev->stats.tx_errors++; diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index b517c1af9de0..309a74da2ec3 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -48,7 +48,7 @@ static void fjes_get_stats64(struct net_device *, struct rtnl_link_stats64 *); static int fjes_change_mtu(struct net_device *, int); static int fjes_vlan_rx_add_vid(struct net_device *, __be16 proto, u16); static int fjes_vlan_rx_kill_vid(struct net_device *, __be16 proto, u16); -static void fjes_tx_retry(struct net_device *); +static void fjes_tx_retry(struct net_device *, unsigned int txqueue); static int fjes_acpi_add(struct acpi_device *); static int fjes_acpi_remove(struct acpi_device *); @@ -792,7 +792,7 @@ fjes_xmit_frame(struct sk_buff *skb, struct net_device *netdev) return ret; } -static void fjes_tx_retry(struct net_device *netdev) +static void fjes_tx_retry(struct net_device *netdev, unsigned int txqueue) { struct netdev_queue *queue = netdev_get_tx_queue(netdev, 0); diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 2a91c192659f..317d3a8df316 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -457,7 +457,7 @@ static void slip_write_wakeup(struct tty_struct *tty) schedule_work(&sl->tx_work); } -static void sl_tx_timeout(struct net_device *dev) +static void sl_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct slip *sl = netdev_priv(dev); diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c index 1e58702c737f..d387bc7ac1b6 100644 --- a/drivers/net/usb/catc.c +++ b/drivers/net/usb/catc.c @@ -447,7 +447,7 @@ static netdev_tx_t catc_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } -static void catc_tx_timeout(struct net_device *netdev) +static void catc_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct catc *catc = netdev_priv(netdev); diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index ca827802f291..417e42c9fd03 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -820,7 +820,7 @@ static const struct ethtool_ops ops = { }; /* called when a packet did not ack after watchdogtimeout */ -static void hso_net_tx_timeout(struct net_device *net) +static void hso_net_tx_timeout(struct net_device *net, unsigned int txqueue) { struct hso_net *odev = netdev_priv(net); diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 8c01fbf68a89..c792d65dd7b4 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -400,7 +400,7 @@ static int ipheth_tx(struct sk_buff *skb, struct net_device *net) return NETDEV_TX_OK; } -static void ipheth_tx_timeout(struct net_device *net) +static void ipheth_tx_timeout(struct net_device *net, unsigned int txqueue) { struct ipheth_device *dev = netdev_priv(net); diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c index 8e210ba4a313..ed01dc964c99 100644 --- a/drivers/net/usb/kaweth.c +++ b/drivers/net/usb/kaweth.c @@ -894,7 +894,7 @@ static void kaweth_async_set_rx_mode(struct kaweth_device *kaweth) /**************************************************************** * kaweth_tx_timeout ****************************************************************/ -static void kaweth_tx_timeout(struct net_device *net) +static void kaweth_tx_timeout(struct net_device *net, unsigned int txqueue) { struct kaweth_device *kaweth = netdev_priv(net); diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index cf1f3f0a4b9b..0c8b9363366b 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -3662,7 +3662,7 @@ static void lan78xx_disconnect(struct usb_interface *intf) usb_put_dev(udev); } -static void lan78xx_tx_timeout(struct net_device *net) +static void lan78xx_tx_timeout(struct net_device *net, unsigned int txqueue) { struct lan78xx_net *dev = netdev_priv(net); diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index f7d117d80cfb..8783e2ab3ec0 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -693,7 +693,7 @@ static void intr_callback(struct urb *urb) "can't resubmit interrupt urb, %d\n", res); } -static void pegasus_tx_timeout(struct net_device *net) +static void pegasus_tx_timeout(struct net_device *net, unsigned int txqueue) { pegasus_t *pegasus = netdev_priv(net); netif_warn(pegasus, timer, net, "tx timeout\n"); diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index c5ebf35d2488..9ec1da429514 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -2507,7 +2507,7 @@ static void rtl_drop_queued_tx(struct r8152 *tp) } } -static void rtl8152_tx_timeout(struct net_device *netdev) +static void rtl8152_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct r8152 *tp = netdev_priv(netdev); diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 13e51ccf0214..e7c630d37589 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -655,7 +655,7 @@ static void disable_net_traffic(rtl8150_t * dev) set_registers(dev, CR, 1, &cr); } -static void rtl8150_tx_timeout(struct net_device *netdev) +static void rtl8150_tx_timeout(struct net_device *netdev, unsigned int txqueue) { rtl8150_t *dev = netdev_priv(netdev); dev_warn(&netdev->dev, "Tx timeout.\n"); diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 30e511c2c8d0..bc88923db369 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1293,7 +1293,7 @@ static void tx_complete (struct urb *urb) /*-------------------------------------------------------------------------*/ -void usbnet_tx_timeout (struct net_device *net) +void usbnet_tx_timeout (struct net_device *net, unsigned int txqueue) { struct usbnet *dev = netdev_priv(net); diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 216acf37ca7c..18f152fa0068 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -3198,7 +3198,7 @@ vmxnet3_free_intr_resources(struct vmxnet3_adapter *adapter) static void -vmxnet3_tx_timeout(struct net_device *netdev) +vmxnet3_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct vmxnet3_adapter *adapter = netdev_priv(netdev); adapter->tx_timeout_count++; diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index af539151d663..5d6532ad6b78 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -268,7 +268,7 @@ static int cosa_net_attach(struct net_device *dev, unsigned short encoding, unsigned short parity); static int cosa_net_open(struct net_device *d); static int cosa_net_close(struct net_device *d); -static void cosa_net_timeout(struct net_device *d); +static void cosa_net_timeout(struct net_device *d, unsigned int txqueue); static netdev_tx_t cosa_net_tx(struct sk_buff *skb, struct net_device *d); static char *cosa_net_setup_rx(struct channel_data *channel, int size); static int cosa_net_rx_done(struct channel_data *channel); @@ -670,7 +670,7 @@ static netdev_tx_t cosa_net_tx(struct sk_buff *skb, return NETDEV_TX_OK; } -static void cosa_net_timeout(struct net_device *dev) +static void cosa_net_timeout(struct net_device *dev, unsigned int txqueue) { struct channel_data *chan = dev_to_chan(dev); diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 1901ec7948d8..7916efce7188 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -2239,7 +2239,7 @@ fst_attach(struct net_device *dev, unsigned short encoding, unsigned short parit } static void -fst_tx_timeout(struct net_device *dev) +fst_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct fst_port_info *port; struct fst_card_info *card; diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index ca0f3be2b6bf..308384756e6f 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1039,7 +1039,7 @@ static const struct dev_pm_ops uhdlc_pm_ops = { #define HDLC_PM_OPS NULL #endif -static void uhdlc_tx_timeout(struct net_device *ndev) +static void uhdlc_tx_timeout(struct net_device *ndev, unsigned int txqueue) { netdev_err(ndev, "%s\n", __func__); } diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c index 0e6a51525d91..a20f467ca48a 100644 --- a/drivers/net/wan/lmc/lmc_main.c +++ b/drivers/net/wan/lmc/lmc_main.c @@ -99,7 +99,7 @@ static int lmc_ifdown(struct net_device * const); static void lmc_watchdog(struct timer_list *t); static void lmc_reset(lmc_softc_t * const sc); static void lmc_dec_reset(lmc_softc_t * const sc); -static void lmc_driver_timeout(struct net_device *dev); +static void lmc_driver_timeout(struct net_device *dev, unsigned int txqueue); /* * linux reserves 16 device specific IOCTLs. We call them @@ -2044,7 +2044,7 @@ static void lmc_initcsrs(lmc_softc_t * const sc, lmc_csrptr_t csr_base, /*fold00 lmc_trace(sc->lmc_device, "lmc_initcsrs out"); } -static void lmc_driver_timeout(struct net_device *dev) +static void lmc_driver_timeout(struct net_device *dev, unsigned int txqueue) { lmc_softc_t *sc = dev_to_sc(dev); u32 csr6; diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c index 914be5847386..69773d228ec1 100644 --- a/drivers/net/wan/x25_asy.c +++ b/drivers/net/wan/x25_asy.c @@ -276,7 +276,7 @@ static void x25_asy_write_wakeup(struct tty_struct *tty) sl->xhead += actual; } -static void x25_asy_timeout(struct net_device *dev) +static void x25_asy_timeout(struct net_device *dev, unsigned int txqueue) { struct x25_asy *sl = netdev_priv(dev); diff --git a/drivers/net/wimax/i2400m/netdev.c b/drivers/net/wimax/i2400m/netdev.c index a5db3c06b646..a7fcbceb6e6b 100644 --- a/drivers/net/wimax/i2400m/netdev.c +++ b/drivers/net/wimax/i2400m/netdev.c @@ -380,7 +380,7 @@ drop: static -void i2400m_tx_timeout(struct net_device *net_dev) +void i2400m_tx_timeout(struct net_device *net_dev, unsigned int txqueue) { /* * We might want to kick the device diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index c4c83ab60cbc..25d7fd6e54bf 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -5833,7 +5833,7 @@ static int ipw2100_close(struct net_device *dev) /* * TODO: Fix this function... its just wrong */ -static void ipw2100_tx_timeout(struct net_device *dev) +static void ipw2100_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct ipw2100_priv *priv = libipw_priv(dev); diff --git a/drivers/net/wireless/intersil/hostap/hostap_main.c b/drivers/net/wireless/intersil/hostap/hostap_main.c index 05466281afb6..de97b3304115 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_main.c +++ b/drivers/net/wireless/intersil/hostap/hostap_main.c @@ -761,7 +761,7 @@ static void hostap_set_multicast_list(struct net_device *dev) } -static void prism2_tx_timeout(struct net_device *dev) +static void prism2_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct hostap_interface *iface; local_info_t *local; diff --git a/drivers/net/wireless/intersil/orinoco/main.c b/drivers/net/wireless/intersil/orinoco/main.c index 28dac36d7c4c..00264a14e52c 100644 --- a/drivers/net/wireless/intersil/orinoco/main.c +++ b/drivers/net/wireless/intersil/orinoco/main.c @@ -647,7 +647,7 @@ static void __orinoco_ev_txexc(struct net_device *dev, struct hermes *hw) netif_wake_queue(dev); } -void orinoco_tx_timeout(struct net_device *dev) +void orinoco_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct orinoco_private *priv = ndev_priv(dev); struct net_device_stats *stats = &dev->stats; diff --git a/drivers/net/wireless/intersil/orinoco/orinoco.h b/drivers/net/wireless/intersil/orinoco/orinoco.h index 430862a6a24b..cdd026af100b 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco.h +++ b/drivers/net/wireless/intersil/orinoco/orinoco.h @@ -207,7 +207,7 @@ int orinoco_open(struct net_device *dev); int orinoco_stop(struct net_device *dev); void orinoco_set_multicast_list(struct net_device *dev); int orinoco_change_mtu(struct net_device *dev, int new_mtu); -void orinoco_tx_timeout(struct net_device *dev); +void orinoco_tx_timeout(struct net_device *dev, unsigned int txqueue); /********************************************************************/ /* Locking and synchronization functions */ diff --git a/drivers/net/wireless/intersil/prism54/islpci_eth.c b/drivers/net/wireless/intersil/prism54/islpci_eth.c index 2b8fb07d07e7..8d680250a281 100644 --- a/drivers/net/wireless/intersil/prism54/islpci_eth.c +++ b/drivers/net/wireless/intersil/prism54/islpci_eth.c @@ -473,7 +473,7 @@ islpci_do_reset_and_wake(struct work_struct *work) } void -islpci_eth_tx_timeout(struct net_device *ndev) +islpci_eth_tx_timeout(struct net_device *ndev, unsigned int txqueue) { islpci_private *priv = netdev_priv(ndev); diff --git a/drivers/net/wireless/intersil/prism54/islpci_eth.h b/drivers/net/wireless/intersil/prism54/islpci_eth.h index 61f4b43c6054..e433ccdc526b 100644 --- a/drivers/net/wireless/intersil/prism54/islpci_eth.h +++ b/drivers/net/wireless/intersil/prism54/islpci_eth.h @@ -53,7 +53,7 @@ struct avs_80211_1_header { void islpci_eth_cleanup_transmit(islpci_private *, isl38xx_control_block *); netdev_tx_t islpci_eth_transmit(struct sk_buff *, struct net_device *); int islpci_eth_receive(islpci_private *); -void islpci_eth_tx_timeout(struct net_device *); +void islpci_eth_tx_timeout(struct net_device *, unsigned int txqueue); void islpci_do_reset_and_wake(struct work_struct *); #endif /* _ISL_GEN_H */ diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index d14e55e3c9da..7d94695e7961 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1020,7 +1020,7 @@ static void mwifiex_set_multicast_list(struct net_device *dev) * CFG802.11 network device handler for transmission timeout. */ static void -mwifiex_tx_timeout(struct net_device *dev) +mwifiex_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c index 5fb598389487..648dfc38bd70 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/core.c +++ b/drivers/net/wireless/quantenna/qtnfmac/core.c @@ -156,7 +156,7 @@ static void qtnf_netdev_get_stats64(struct net_device *ndev, /* Netdev handler for transmission timeout. */ -static void qtnf_netdev_tx_timeout(struct net_device *ndev) +static void qtnf_netdev_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct qtnf_vif *vif = qtnf_netdev_get_priv(ndev); struct qtnf_wmac *mac; diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c index 007bf6803293..686161db8706 100644 --- a/drivers/net/wireless/wl3501_cs.c +++ b/drivers/net/wireless/wl3501_cs.c @@ -1285,7 +1285,7 @@ out: return rc; } -static void wl3501_tx_timeout(struct net_device *dev) +static void wl3501_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct net_device_stats *stats = &dev->stats; int rc; diff --git a/drivers/net/wireless/zydas/zd1201.c b/drivers/net/wireless/zydas/zd1201.c index 0db7362bedb4..41641fc2be74 100644 --- a/drivers/net/wireless/zydas/zd1201.c +++ b/drivers/net/wireless/zydas/zd1201.c @@ -830,7 +830,7 @@ static netdev_tx_t zd1201_hard_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } -static void zd1201_tx_timeout(struct net_device *dev) +static void zd1201_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct zd1201 *zd = netdev_priv(dev); diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 871d44746f5c..a23875a4fa2b 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -1076,7 +1076,7 @@ void qeth_clear_working_pool_list(struct qeth_card *); void qeth_drain_output_queues(struct qeth_card *card); void qeth_setadp_promisc_mode(struct qeth_card *card, bool enable); int qeth_setadpparms_change_macaddr(struct qeth_card *); -void qeth_tx_timeout(struct net_device *); +void qeth_tx_timeout(struct net_device *, unsigned int txqueue); void qeth_prepare_ipa_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob, u16 cmd_length); int qeth_query_switch_attributes(struct qeth_card *card, diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index b9a2349e4b90..ce7ff1abbef3 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -4325,7 +4325,7 @@ int qeth_set_access_ctrl_online(struct qeth_card *card, int fallback) return rc; } -void qeth_tx_timeout(struct net_device *dev) +void qeth_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct qeth_card *card; diff --git a/drivers/staging/ks7010/ks_wlan_net.c b/drivers/staging/ks7010/ks_wlan_net.c index 3cffc8be6656..211dd4a11cac 100644 --- a/drivers/staging/ks7010/ks_wlan_net.c +++ b/drivers/staging/ks7010/ks_wlan_net.c @@ -45,7 +45,7 @@ struct wep_key { * function prototypes */ static int ks_wlan_open(struct net_device *dev); -static void ks_wlan_tx_timeout(struct net_device *dev); +static void ks_wlan_tx_timeout(struct net_device *dev, unsigned int txqueue); static int ks_wlan_start_xmit(struct sk_buff *skb, struct net_device *dev); static int ks_wlan_close(struct net_device *dev); static void ks_wlan_set_rx_mode(struct net_device *dev); @@ -2498,7 +2498,7 @@ int ks_wlan_set_mac_address(struct net_device *dev, void *addr) } static -void ks_wlan_tx_timeout(struct net_device *dev) +void ks_wlan_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct ks_wlan_private *priv = netdev_priv(dev); diff --git a/drivers/staging/qlge/qlge_main.c b/drivers/staging/qlge/qlge_main.c index 6ad4515311f7..24d20f000435 100644 --- a/drivers/staging/qlge/qlge_main.c +++ b/drivers/staging/qlge/qlge_main.c @@ -4274,7 +4274,7 @@ static int qlge_set_mac_address(struct net_device *ndev, void *p) return status; } -static void qlge_tx_timeout(struct net_device *ndev) +static void qlge_tx_timeout(struct net_device *ndev, unsigned int txqueue) { struct ql_adapter *qdev = netdev_priv(ndev); ql_queue_asic_error(qdev); diff --git a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c index dace81a7d1ba..a51d627284d1 100644 --- a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c +++ b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c @@ -267,7 +267,7 @@ static short _rtl92e_check_nic_enough_desc(struct net_device *dev, int prio) return 0; } -static void _rtl92e_tx_timeout(struct net_device *dev) +static void _rtl92e_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct r8192_priv *priv = rtllib_priv(dev); diff --git a/drivers/staging/rtl8192u/r8192U_core.c b/drivers/staging/rtl8192u/r8192U_core.c index 7e2cabd16e88..482382a887f8 100644 --- a/drivers/staging/rtl8192u/r8192U_core.c +++ b/drivers/staging/rtl8192u/r8192U_core.c @@ -640,7 +640,7 @@ short check_nic_enough_desc(struct net_device *dev, int queue_index) return (used < MAX_TX_URB); } -static void tx_timeout(struct net_device *dev) +static void tx_timeout(struct net_device *dev, unsigned int txqueue) { struct r8192_priv *priv = ieee80211_priv(dev); diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c index 1d1440d43002..0433536930a9 100644 --- a/drivers/staging/unisys/visornic/visornic_main.c +++ b/drivers/staging/unisys/visornic/visornic_main.c @@ -1078,7 +1078,7 @@ out_save_flags: * Queue the work and return. Make sure we have not already been informed that * the IO Partition is gone; if so, we will have already timed-out the xmits. */ -static void visornic_xmit_timeout(struct net_device *netdev) +static void visornic_xmit_timeout(struct net_device *netdev, unsigned int txqueue) { struct visornic_devdata *devdata = netdev_priv(netdev); unsigned long flags; diff --git a/drivers/staging/wlan-ng/p80211netdev.c b/drivers/staging/wlan-ng/p80211netdev.c index a70fb84f38f1..b809c0015c0c 100644 --- a/drivers/staging/wlan-ng/p80211netdev.c +++ b/drivers/staging/wlan-ng/p80211netdev.c @@ -101,7 +101,7 @@ static void p80211knetdev_set_multicast_list(struct net_device *dev); static int p80211knetdev_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static int p80211knetdev_set_mac_address(struct net_device *dev, void *addr); -static void p80211knetdev_tx_timeout(struct net_device *netdev); +static void p80211knetdev_tx_timeout(struct net_device *netdev, unsigned int txqueue); static int p80211_rx_typedrop(struct wlandevice *wlandev, u16 fc); int wlan_watchdog = 5000; @@ -1074,7 +1074,7 @@ static int p80211_rx_typedrop(struct wlandevice *wlandev, u16 fc) return drop; } -static void p80211knetdev_tx_timeout(struct net_device *netdev) +static void p80211knetdev_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct wlandevice *wlandev = netdev->ml_priv; diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 36a3eb4ad4c5..f1c90fa2978e 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -2704,7 +2704,7 @@ static netdev_tx_t gsm_mux_net_start_xmit(struct sk_buff *skb, } /* called when a packet did not ack after watchdogtimeout */ -static void gsm_mux_net_tx_timeout(struct net_device *net) +static void gsm_mux_net_tx_timeout(struct net_device *net, unsigned int txqueue) { /* Tell syslog we are hosed. */ dev_dbg(&net->dev, "Tx timed out.\n"); diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 84f26e43b229..61dc6b4a43d0 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -7837,7 +7837,7 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * * dev pointer to network device structure */ -static void hdlcdev_tx_timeout(struct net_device *dev) +static void hdlcdev_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct mgsl_struct *info = dev_to_port(dev); unsigned long flags; diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index e8a9047de451..5d59e2369c8a 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -1682,7 +1682,7 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * * dev pointer to network device structure */ -static void hdlcdev_tx_timeout(struct net_device *dev) +static void hdlcdev_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct slgt_info *info = dev_to_port(dev); unsigned long flags; diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index fcb91bf7a15b..33181fa6eb18 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -1807,7 +1807,7 @@ static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * * dev pointer to network device structure */ -static void hdlcdev_tx_timeout(struct net_device *dev) +static void hdlcdev_tx_timeout(struct net_device *dev, unsigned int txqueue) { SLMP_INFO *info = dev_to_port(dev); unsigned long flags; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ef20389622d..30745068fb39 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1014,7 +1014,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); * Called when a user wants to change the Maximum Transfer Unit * of a device. * - * void (*ndo_tx_timeout)(struct net_device *dev); + * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue); * Callback used when the transmitter has not made any progress * for dev->watchdog ticks. * @@ -1281,7 +1281,8 @@ struct net_device_ops { int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); - void (*ndo_tx_timeout) (struct net_device *dev); + void (*ndo_tx_timeout) (struct net_device *dev, + unsigned int txqueue); void (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index d8860f2d0976..b0bff3083278 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -253,7 +253,7 @@ extern int usbnet_open(struct net_device *net); extern int usbnet_stop(struct net_device *net); extern netdev_tx_t usbnet_start_xmit(struct sk_buff *skb, struct net_device *net); -extern void usbnet_tx_timeout(struct net_device *net); +extern void usbnet_tx_timeout(struct net_device *net, unsigned int txqueue); extern int usbnet_change_mtu(struct net_device *net, int new_mtu); extern int usbnet_get_endpoints(struct usbnet *, struct usb_interface *); diff --git a/net/atm/lec.c b/net/atm/lec.c index 5a77c235a212..b57368e70aab 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -194,7 +194,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) dev->stats.tx_bytes += skb->len; } -static void lec_tx_timeout(struct net_device *dev) +static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) { pr_info("%s\n", dev->name); netif_trans_update(dev); diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index 1d4d7d415730..cc1cff63194f 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -112,7 +112,7 @@ static int bnep_net_set_mac_addr(struct net_device *dev, void *arg) return 0; } -static void bnep_net_timeout(struct net_device *dev) +static void bnep_net_timeout(struct net_device *dev, unsigned int txqueue) { BT_DBG("net_timeout"); netif_wake_queue(dev); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5ab696efca95..6c9595f1048a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -441,7 +441,7 @@ static void dev_watchdog(struct timer_list *t) trace_net_dev_xmit_timeout(dev, i); WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", dev->name, netdev_drivername(dev), i); - dev->netdev_ops->ndo_tx_timeout(dev); + dev->netdev_ops->ndo_tx_timeout(dev, i); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + -- cgit v1.2.3 From 7e6897f95935973c3253fd756135b5ea58043dc8 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 13 Dec 2019 18:51:09 +0100 Subject: bpf, xdp: Start using the BPF dispatcher for XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a BPF dispatcher for XDP. The dispatcher is updated from the XDP control-path, dev_xdp_install(), and used when an XDP program is run via bpf_prog_run_xdp(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-4-bjorn.topel@gmail.com --- include/linux/bpf.h | 15 +++++++++++++++ include/linux/filter.h | 40 ++++++++++++++++++++++++---------------- kernel/bpf/syscall.c | 26 ++++++++++++++++++-------- net/core/dev.c | 19 ++++++++++++++++++- net/core/filter.c | 8 ++++++++ 5 files changed, 83 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 53ae4a50abe4..5970989b99d1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,14 @@ struct bpf_dispatcher { u32 image_off; }; +static __always_inline unsigned int bpf_dispatcher_nopfunc( + const void *ctx, + const struct bpf_insn *insnsi, + unsigned int (*bpf_func)(const void *, + const struct bpf_insn *)) +{ + return bpf_func(ctx, insnsi); +} #ifdef CONFIG_BPF_JIT struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); @@ -997,6 +1005,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog); +struct bpf_prog *bpf_prog_by_id(u32 id); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1128,6 +1138,11 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, static inline void bpf_map_put(struct bpf_map *map) { } + +static inline struct bpf_prog *bpf_prog_by_id(u32 id) +{ + return ERR_PTR(-ENOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, diff --git a/include/linux/filter.h b/include/linux/filter.h index a141cb07e76a..37ac7025031d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -559,23 +559,26 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); -#define BPF_PROG_RUN(prog, ctx) ({ \ - u32 ret; \ - cant_sleep(); \ - if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ - } else { \ - ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ - } \ +#define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ + u32 ret; \ + cant_sleep(); \ + if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ + struct bpf_prog_stats *stats; \ + u64 start = sched_clock(); \ + ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&stats->syncp); \ + stats->cnt++; \ + stats->nsecs += sched_clock() - start; \ + u64_stats_update_end(&stats->syncp); \ + } else { \ + ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + } \ ret; }) +#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx, \ + bpf_dispatcher_nopfunc) + #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { @@ -699,6 +702,8 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return res; } +DECLARE_BPF_DISPATCHER(bpf_dispatcher_xdp) + static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { @@ -708,9 +713,12 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * already takes rcu_read_lock() when fetching the program, so * it's not necessary here anymore. */ - return BPF_PROG_RUN(prog, xdp); + return __BPF_PROG_RUN(prog, xdp, + BPF_DISPATCHER_FUNC(bpf_dispatcher_xdp)); } +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); + static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 66b90eaf99fe..b08c362f4e02 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2338,17 +2338,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id -static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; - u32 id = attr->prog_id; - int fd; - - if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); @@ -2357,7 +2352,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); + return prog; +} + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/net/core/dev.c b/net/core/dev.c index 2c277b8aba38..255d3cf35360 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8542,7 +8542,17 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { + bool non_hw = !(flags & XDP_FLAGS_HW_MODE); + struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; + int err; + + if (non_hw) { + prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, + XDP_QUERY_PROG)); + if (IS_ERR(prev_prog)) + prev_prog = NULL; + } memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -8553,7 +8563,14 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, xdp.flags = flags; xdp.prog = prog; - return bpf_op(dev, &xdp); + err = bpf_op(dev, &xdp); + if (!err && non_hw) + bpf_prog_change_xdp(prev_prog, prog); + + if (prev_prog) + bpf_prog_put(prev_prog); + + return err; } static void dev_xdp_uninstall(struct net_device *dev) diff --git a/net/core/filter.c b/net/core/filter.c index f1e703eed3d2..a411f7835dee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8940,3 +8940,11 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ + +DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) + +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) +{ + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), + prev_prog, prog); +} -- cgit v1.2.3 From f23c4b3924d2e9382820ee677b68d42d5dd7b08b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 13 Dec 2019 18:51:10 +0100 Subject: bpf: Start using the BPF dispatcher in BPF_TEST_RUN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to properly exercise the BPF dispatcher, this commit adds BPF dispatcher usage to BPF_TEST_RUN when executing XDP programs. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191213175112.30208-5-bjorn.topel@gmail.com --- net/bpf/test_run.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 85c8cbbada92..5016c538d3ce 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,7 +15,7 @@ #include static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, - u32 *retval, u32 *time) + u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; enum bpf_cgroup_storage_type stype; @@ -41,7 +41,11 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { bpf_cgroup_storage_set(storage); - *retval = BPF_PROG_RUN(prog, ctx); + + if (xdp) + *retval = bpf_prog_run_xdp(prog, ctx); + else + *retval = BPF_PROG_RUN(prog, ctx); if (signal_pending(current)) { ret = -EINTR; @@ -356,7 +360,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; - ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { @@ -413,8 +417,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + bpf_prog_change_xdp(NULL, prog); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || @@ -422,6 +426,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: + bpf_prog_change_xdp(prog, NULL); kfree(data); return ret; } -- cgit v1.2.3 From 850a88cc4096fe1df407452ba2e4d28cf5b3eee9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 13 Dec 2019 14:30:27 -0800 Subject: bpf: Expose __sk_buff wire_len/gso_segs to BPF_PROG_TEST_RUN wire_len should not be less than real len and is capped by GSO_MAX_SIZE. gso_segs is capped by GSO_MAX_SEGS. v2: * set wire_len to skb->len when passed wire_len is 0 (Alexei Starovoitov) Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191213223028.161282-1-sdf@google.com --- net/bpf/test_run.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 5016c538d3ce..93a9c87787e0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -267,8 +267,10 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return -EINVAL; /* tstamp is allowed */ + /* wire_len is allowed */ + /* gso_segs is allowed */ - if (!range_is_zero(__skb, offsetofend(struct __sk_buff, tstamp), + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), sizeof(struct __sk_buff))) return -EINVAL; @@ -276,6 +278,19 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + if (__skb->wire_len == 0) { + cb->pkt_len = skb->len; + } else { + if (__skb->wire_len < skb->len || + __skb->wire_len > GSO_MAX_SIZE) + return -EINVAL; + cb->pkt_len = __skb->wire_len; + } + + if (__skb->gso_segs > GSO_MAX_SEGS) + return -EINVAL; + skb_shinfo(skb)->gso_segs = __skb->gso_segs; + return 0; } @@ -289,6 +304,8 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) __skb->priority = skb->priority; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); + __skb->wire_len = cb->pkt_len; + __skb->gso_segs = skb_shinfo(skb)->gso_segs; } int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, -- cgit v1.2.3 From de1799667b002331609e8ee6b924ccfd51968d99 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 11 Dec 2019 20:07:10 -0500 Subject: net: bridge: add STP xstats This adds rx_bpdu, tx_bpdu, rx_tcn, tx_tcn, transition_blk, transition_fwd xstats counters to the bridge ports copied over via netlink, providing useful information for STP. Signed-off-by: Vivien Didelot Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 10 ++++++++++ net/bridge/br_netlink.c | 13 +++++++++++++ net/bridge/br_private.h | 2 ++ net/bridge/br_stp.c | 15 +++++++++++++++ net/bridge/br_stp_bpdu.c | 4 ++++ 5 files changed, 44 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 1b3c2b643a02..4a58e3d7de46 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -156,6 +156,15 @@ struct bridge_vlan_xstats { __u32 pad2; }; +struct bridge_stp_xstats { + __u64 transition_blk; + __u64 transition_fwd; + __u64 rx_bpdu; + __u64 tx_bpdu; + __u64 rx_tcn; + __u64 tx_tcn; +}; + /* Bridge multicast database attributes * [MDBA_MDB] = { * [MDBA_MDB_ENTRY] = { @@ -262,6 +271,7 @@ enum { BRIDGE_XSTATS_VLAN, BRIDGE_XSTATS_MCAST, BRIDGE_XSTATS_PAD, + BRIDGE_XSTATS_STP, __BRIDGE_XSTATS_MAX }; #define BRIDGE_XSTATS_MAX (__BRIDGE_XSTATS_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index a0a54482aabc..60136575aea4 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1607,6 +1607,19 @@ static int br_fill_linkxstats(struct sk_buff *skb, br_multicast_get_stats(br, p, nla_data(nla)); } #endif + + if (p) { + nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP, + sizeof(p->stp_xstats), + BRIDGE_XSTATS_PAD); + if (!nla) + goto nla_put_failure; + + spin_lock_bh(&br->lock); + memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats)); + spin_unlock_bh(&br->lock); + } + nla_nest_end(skb, nest); *prividx = 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 36b0367ca1e0..f540f3bdf294 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -283,6 +283,8 @@ struct net_bridge_port { #endif u16 group_fwd_mask; u16 backup_redirected_cnt; + + struct bridge_stp_xstats stp_xstats; }; #define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f1410f8d312..6856a6d9282b 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -45,6 +45,17 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) br_info(p->br, "port %u(%s) entered %s state\n", (unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]); + + if (p->br->stp_enabled == BR_KERNEL_STP) { + switch (p->state) { + case BR_STATE_BLOCKING: + p->stp_xstats.transition_blk++; + break; + case BR_STATE_FORWARDING: + p->stp_xstats.transition_fwd++; + break; + } + } } /* called under bridge lock */ @@ -484,6 +495,8 @@ void br_received_config_bpdu(struct net_bridge_port *p, struct net_bridge *br; int was_root; + p->stp_xstats.rx_bpdu++; + br = p->br; was_root = br_is_root_bridge(br); @@ -517,6 +530,8 @@ void br_received_config_bpdu(struct net_bridge_port *p, /* called under bridge lock */ void br_received_tcn_bpdu(struct net_bridge_port *p) { + p->stp_xstats.rx_tcn++; + if (br_is_designated_port(p)) { br_info(p->br, "port %u(%s) received tcn bpdu\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 7796dd9d42d7..0e4572f31330 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -118,6 +118,8 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) br_set_ticks(buf+33, bpdu->forward_delay); br_send_bpdu(p, buf, 35); + + p->stp_xstats.tx_bpdu++; } /* called under bridge lock */ @@ -133,6 +135,8 @@ void br_send_tcn_bpdu(struct net_bridge_port *p) buf[2] = 0; buf[3] = BPDU_TYPE_TCN; br_send_bpdu(p, buf, 4); + + p->stp_xstats.tx_tcn++; } /* -- cgit v1.2.3 From 29115cef85781057e78f0c967979edd40b16c209 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 12 Dec 2019 22:35:41 +0100 Subject: net/smc: shorten lgr_cnt initialization Save a line of code by making use of ATOMIC_INIT() for lgr_cnt. Suggested-by: David S. Miller Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index bb92c7c6214c..c16b8d3e200b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -41,7 +41,7 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; -static atomic_t lgr_cnt; /* number of existing link groups */ +static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -1297,7 +1297,6 @@ static struct notifier_block smc_reboot_notifier = { int __init smc_core_init(void) { - atomic_set(&lgr_cnt, 0); return register_reboot_notifier(&smc_reboot_notifier); } -- cgit v1.2.3 From 0e627190563e8be6fc9c2cc2a8a3c14bd961754c Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Sun, 15 Dec 2019 11:54:51 -0800 Subject: tcp: Set rcv zerocopy hint correctly if skb last frag is < PAGE_SIZE At present, if the last frag of paged data in a skb has < PAGE_SIZE data, we compute the recv_skip_hint as being equal to the size of that frag and the entire next skb. Instead, just return the runt frag size as the hint. recv_skip_hint is used by the application to skip over bytes that can not be mmaped, so returning a too big chunk is pessimistic and forces more bytes to be copied. Signed-off-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 09e2cae92956..93fe77c5b02d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1778,6 +1778,8 @@ static int tcp_zerocopy_receive(struct sock *sk, while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { if (skb) { + if (zc->recv_skip_hint > 0) + break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { -- cgit v1.2.3 From a6c76c17df021b141b0d306828c9fe4ba2d2717c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:07 +0200 Subject: ipv4: Notify route after insertion to the routing table Currently, a new route is notified in the FIB notification chain before it is inserted to the FIB alias list. Subsequent patches will use the placement of the new route in the ordered FIB alias list in order to determine if the route should be notified or not. As a preparatory step, change the order so that the route is first inserted into the FIB alias list and only then notified. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b9df9c09b84e..9264d6628e9f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1063,9 +1063,6 @@ noleaf: return -ENOMEM; } -/* fib notifier for ADD is sent before calling fib_insert_alias with - * the expectation that the only possible failure ENOMEM - */ static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) @@ -1118,6 +1115,9 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) return true; } +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old); + /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1269,14 +1269,19 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); + /* Insert new entry to the list. */ + err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; - /* Insert new entry to the list. */ - err = fib_insert_alias(t, tp, l, new_fa, fa, key); + /* The alias was already inserted, so the node must exist. */ + l = l ? l : fib_find_node(t, &tp, key); + if (WARN_ON_ONCE(!l)) + goto out_free_new_fa; + + err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); if (err) - goto out_fib_notif; + goto out_remove_new_fa; if (!plen) tb->tb_num_default++; @@ -1287,14 +1292,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, succeeded: return 0; -out_fib_notif: - /* notifier was sent that entry would be added to trie, but - * the add failed and need to recover. Only failure for - * fib_insert_alias is ENOMEM. - */ - NL_SET_ERR_MSG(extack, "Failed to insert route into trie"); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, - plen, new_fa, NULL); +out_remove_new_fa: + fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: -- cgit v1.2.3 From b5fc0430dcbb36294b29a25cddb3bf7e92fe2307 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:08 +0200 Subject: ipv4: Extend FIB alias find function Extend the function with another argument, 'find_first'. When set, the function returns the first FIB alias with the matching {prefix, prefix length, table ID}. The TOS and priority parameters are ignored. Current callers are converted to pass 'false' in order to maintain existing behavior. This will be used by subsequent patches in the series. v2: * New patch Signed-off-by: Ido Schimmel Suggested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9264d6628e9f..4c22e768a5b5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -980,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t, /* Return the first fib alias matching TOS with * priority less than or equal to PRIO. + * If 'find_first' is set, return the first matching + * fib alias, regardless of TOS and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, - u8 tos, u32 prio, u32 tb_id) + u8 tos, u32 prio, u32 tb_id, + bool find_first) { struct fib_alias *fa; @@ -998,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, continue; if (fa->tb_id != tb_id) break; + if (find_first) + return fa; if (fa->fa_tos > tos) continue; if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) @@ -1149,7 +1154,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, - tb->tb_id) : NULL; + tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,tos,priority], if such key already @@ -1565,7 +1570,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!l) return -ESRCH; - fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); + fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false); if (!fa) return -ESRCH; -- cgit v1.2.3 From ee3936d658821d26de9039c64c90750ba240989d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:09 +0200 Subject: ipv4: Notify route if replacing currently offloaded one When replacing a route, its replacement should only be notified in case the replaced route is of any interest to listeners. In other words, if the replaced route is currently used in the data path, which means it is the first route in the FIB alias list with the given {prefix, prefix length, table ID}. v2: * Convert to use fib_find_alias() instead of fib_find_first_alias() Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4c22e768a5b5..4c80ac0344f4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1222,6 +1222,17 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, + tb->tb_id, true) == fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + err = call_fib_entry_notifiers(net, fib_event, + key, plen, + new_fa, extack); + if (err) + goto out_free_new_fa; + } err = call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, key, plen, new_fa, -- cgit v1.2.3 From a8674f753e36f566d6c1d992ab85323d784281d9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:10 +0200 Subject: ipv4: Notify newly added route if should be offloaded When a route is added, it should only be notified in case it is the first route in the FIB alias list with the given {prefix, prefix length, table ID}. Otherwise, it is not used in the data path and should not be considered by switch drivers. v2: * Convert to use fib_find_alias() instead of fib_find_first_alias() Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4c80ac0344f4..f56945d00d7a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1295,6 +1295,16 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (WARN_ON_ONCE(!l)) goto out_free_new_fa; + if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == + new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + err = call_fib_entry_notifiers(net, fib_event, key, plen, + new_fa, extack); + if (err) + goto out_remove_new_fa; + } err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; -- cgit v1.2.3 From f613b6e2ffe13e9acba0d330a058b87aa0e19a2a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:11 +0200 Subject: ipv4: Handle route deletion notification When a route is deleted we potentially need to promote the next route in the FIB alias list (e.g., with an higher metric). In case we find such a route, a replace notification is emitted. Otherwise, a delete notification for the deleted route. v2: * Convert to use fib_find_alias() instead of fib_find_first_alias() Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f56945d00d7a..012aca433a91 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1570,6 +1570,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, node_pull_suffix(tp, fa->fa_slen); } +static void fib_notify_alias_delete(struct net *net, u32 key, + struct hlist_head *fah, + struct fib_alias *fa_to_delete, + struct netlink_ext_ack *extack) +{ + struct fib_alias *fa_next, *fa_to_notify; + u32 tb_id = fa_to_delete->tb_id; + u8 slen = fa_to_delete->fa_slen; + enum fib_event_type fib_event; + + /* Do not notify if we do not care about the route. */ + if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) + return; + + /* Determine if the route should be replaced by the next route in the + * list. + */ + fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, + struct fib_alias, fa_list); + if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { + fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + fa_to_notify = fa_next; + } else { + fib_event = FIB_EVENT_ENTRY_DEL_TMP; + fa_to_notify = fa_to_delete; + } + call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, + fa_to_notify, extack); +} + /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1623,6 +1653,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!fa_to_delete) return -ESRCH; + fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, -- cgit v1.2.3 From 525bc345fcbc6f0d58977452999dd18a15dc0e9b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:12 +0200 Subject: ipv4: Handle route deletion notification during flush In a similar fashion to previous patch, when a route is deleted as part of table flushing, promote the next route in the list, if exists. Otherwise, simply emit a delete notification. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 012aca433a91..c23be49ca51c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1979,6 +1979,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) continue; } + fib_notify_alias_delete(net, n->key, &n->leaf, fa, + NULL); call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, KEYLENGTH - fa->fa_slen, fa, -- cgit v1.2.3 From 20d1565203e68cd32d5db3e293e7323173ad7a0d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:13 +0200 Subject: ipv4: Only Replay routes of interest to new listeners When a new listener is registered to the FIB notification chain it receives a dump of all the available routes in the system. Instead, make sure to only replay the IPv4 routes that are actually used in the data path and are of any interest to the new listener. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c23be49ca51c..3f2ff97618ec 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2080,6 +2080,7 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct netlink_ext_ack *extack) { struct fib_alias *fa; + int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { @@ -2099,6 +2100,16 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, fa, extack); if (err) return err; + + if (fa->fa_slen == last_slen) + continue; + + last_slen = fa->fa_slen; + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE_TMP, + l->key, KEYLENGTH - fa->fa_slen, + fa, extack); + if (err) + return err; } return 0; } -- cgit v1.2.3 From 446f739104f4da6207230363848ec2c89dfd858d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 14 Dec 2019 17:53:15 +0200 Subject: ipv4: Remove old route notifications and convert listeners Unlike mlxsw, the other listeners to the FIB notification chain do not require any special modifications as they never considered multiple identical routes. This patch removes the old route notifications and converts all the listeners to use the new replace / delete notifications. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c | 4 --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 11 +++---- drivers/net/ethernet/rocker/rocker_main.c | 4 +-- drivers/net/netdevsim/fib.c | 4 +-- include/net/fib_notifier.h | 2 -- net/ipv4/fib_trie.c | 38 ++++------------------ 6 files changed, 16 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c index b70afa310ad2..416676c35b1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c @@ -200,8 +200,6 @@ static void mlx5_lag_fib_update(struct work_struct *work) rtnl_lock(); switch (fib_work->event) { case FIB_EVENT_ENTRY_REPLACE: /* fall through */ - case FIB_EVENT_ENTRY_APPEND: /* fall through */ - case FIB_EVENT_ENTRY_ADD: /* fall through */ case FIB_EVENT_ENTRY_DEL: mlx5_lag_fib_route_event(ldev, fib_work->event, fib_work->fen_info.fi); @@ -259,8 +257,6 @@ static int mlx5_lag_fib_event(struct notifier_block *nb, switch (event) { case FIB_EVENT_ENTRY_REPLACE: /* fall through */ - case FIB_EVENT_ENTRY_APPEND: /* fall through */ - case FIB_EVENT_ENTRY_ADD: /* fall through */ case FIB_EVENT_ENTRY_DEL: fen_info = container_of(info, struct fib_entry_notifier_info, info); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 396b27b9cdb4..bba1c8215d06 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -6007,14 +6007,14 @@ static void mlxsw_sp_router_fib4_event_work(struct work_struct *work) mlxsw_sp_span_respin(mlxsw_sp); switch (fib_work->event) { - case FIB_EVENT_ENTRY_REPLACE_TMP: + case FIB_EVENT_ENTRY_REPLACE: err = mlxsw_sp_router_fib4_replace(mlxsw_sp, &fib_work->fen_info); if (err) mlxsw_sp_router_fib_abort(mlxsw_sp); fib_info_put(fib_work->fen_info.fi); break; - case FIB_EVENT_ENTRY_DEL_TMP: + case FIB_EVENT_ENTRY_DEL: mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info); fib_info_put(fib_work->fen_info.fi); break; @@ -6111,8 +6111,8 @@ static void mlxsw_sp_router_fib4_event(struct mlxsw_sp_fib_event_work *fib_work, struct fib_nh_notifier_info *fnh_info; switch (fib_work->event) { - case FIB_EVENT_ENTRY_REPLACE_TMP: /* fall through */ - case FIB_EVENT_ENTRY_DEL_TMP: + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_DEL: fen_info = container_of(info, struct fib_entry_notifier_info, info); fib_work->fen_info = *fen_info; @@ -6243,8 +6243,7 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, router->mlxsw_sp); return notifier_from_errno(err); case FIB_EVENT_ENTRY_ADD: /* fall through */ - case FIB_EVENT_ENTRY_REPLACE: /* fall through */ - case FIB_EVENT_ENTRY_REPLACE_TMP: + case FIB_EVENT_ENTRY_REPLACE: if (router->aborted) { NL_SET_ERR_MSG_MOD(info->extack, "FIB offload was aborted. Not configuring route"); return notifier_from_errno(-EINVAL); diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index bc4f951315da..7585cd2270ba 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2159,7 +2159,7 @@ static void rocker_router_fib_event_work(struct work_struct *work) /* Protect internal structures from changes */ rtnl_lock(); switch (fib_work->event) { - case FIB_EVENT_ENTRY_ADD: + case FIB_EVENT_ENTRY_REPLACE: err = rocker_world_fib4_add(rocker, &fib_work->fen_info); if (err) rocker_world_fib4_abort(rocker); @@ -2201,7 +2201,7 @@ static int rocker_router_fib_event(struct notifier_block *nb, fib_work->event = event; switch (event) { - case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ case FIB_EVENT_ENTRY_DEL: if (info->family == AF_INET) { struct fib_entry_notifier_info *fen_info = ptr; diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 13540dee7364..4e02a4231fcb 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -177,10 +177,10 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, event == FIB_EVENT_RULE_ADD); break; + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ case FIB_EVENT_ENTRY_ADD: /* fall through */ case FIB_EVENT_ENTRY_DEL: - err = nsim_fib_event(data, info, - event == FIB_EVENT_ENTRY_ADD); + err = nsim_fib_event(data, info, event != FIB_EVENT_ENTRY_DEL); break; } diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index b3c54325caec..6d59221ff05a 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -23,8 +23,6 @@ enum fib_event_type { FIB_EVENT_NH_DEL, FIB_EVENT_VIF_ADD, FIB_EVENT_VIF_DEL, - FIB_EVENT_ENTRY_REPLACE_TMP, - FIB_EVENT_ENTRY_DEL_TMP, }; struct fib_notifier_ops { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3f2ff97618ec..b92a42433a7d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1127,7 +1127,6 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; @@ -1226,19 +1225,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_id, true) == fa) { enum fib_event_type fib_event; - fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_free_new_fa; } - err = call_fib_entry_notifiers(net, - FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa, - extack); - if (err) - goto out_free_new_fa; rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1260,12 +1253,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (fa_match) goto out; - if (cfg->fc_nlflags & NLM_F_APPEND) { - event = FIB_EVENT_ENTRY_APPEND; + if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; - } else { + else fa = fa_first; - } } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) @@ -1299,15 +1290,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa) { enum fib_event_type fib_event; - fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; } - err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); - if (err) - goto out_remove_new_fa; if (!plen) tb->tb_num_default++; @@ -1590,10 +1578,10 @@ static void fib_notify_alias_delete(struct net *net, u32 key, fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, struct fib_alias, fa_list); if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { - fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + fib_event = FIB_EVENT_ENTRY_REPLACE; fa_to_notify = fa_next; } else { - fib_event = FIB_EVENT_ENTRY_DEL_TMP; + fib_event = FIB_EVENT_ENTRY_DEL; fa_to_notify = fa_to_delete; } call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, @@ -1654,8 +1642,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1981,10 +1967,6 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, - n->key, - KEYLENGTH - fa->fa_slen, fa, - NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -2095,17 +2077,11 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, if (tb->tb_id != fa->tb_id) continue; - err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, - fa, extack); - if (err) - return err; - if (fa->fa_slen == last_slen) continue; last_slen = fa->fa_slen; - err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE_TMP, + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) -- cgit v1.2.3 From 871185ace40df871a93866b2a7ce441276fc4ee8 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 20 Nov 2019 12:33:59 +0100 Subject: netfilter: Clean up unnecessary #ifdef If CONFIG_NETFILTER_INGRESS is not enabled, nf_ingress() becomes a no-op because it solely contains an if-clause calling nf_hook_ingress_active(), for which an empty inline stub exists in . All the symbols used in the if-clause's body are still available even if CONFIG_NETFILTER_INGRESS is not enabled. The additional "#ifdef CONFIG_NETFILTER_INGRESS" in nf_ingress() is thus unnecessary, so drop it. Signed-off-by: Lukas Wunner Cc: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 2c277b8aba38..1ccead4b19bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4932,7 +4932,6 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { -#ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { int ingress_retval; @@ -4946,7 +4945,6 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, rcu_read_unlock(); return ingress_retval; } -#endif /* CONFIG_NETFILTER_INGRESS */ return 0; } -- cgit v1.2.3 From cf3e204a1ca5442190018a317d9ec181b4639bd6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 13 Dec 2019 16:53:05 +0800 Subject: netfilter: nft_tunnel: no need to call htons() when dumping ports info->key.tp_src and tp_dst are __be16, when using nla_put_be16() to dump them, htons() is not needed, so remove it in this patch. Fixes: af308b94a2a4 ("netfilter: nf_tables: add tunnel support") Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 3d4c2ae605a8..ef2065dd4f8a 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -501,8 +501,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, static int nft_tunnel_ports_dump(struct sk_buff *skb, struct ip_tunnel_info *info) { - if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 || - nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0) + if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 || + nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0) return -1; return 0; -- cgit v1.2.3 From 0705f95c332081036d85f26691e9d3cd7d901c31 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 13 Dec 2019 16:53:06 +0800 Subject: netfilter: nft_tunnel: add the missing ERSPAN_VERSION nla_policy ERSPAN_VERSION is an attribute parsed in kernel side, nla_policy type should be added for it, like other attributes. Fixes: af308b94a2a4 ("netfilter: nf_tables: add tunnel support") Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index ef2065dd4f8a..6538895466e0 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -248,8 +248,9 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, } static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = { + [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 }, [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 }, - [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, + [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 }, [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 }, }; -- cgit v1.2.3 From 2149f36dbd44f2d6bf5357e5d096205b92cd854a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 13 Dec 2019 16:53:07 +0800 Subject: netfilter: nft_tunnel: also dump ERSPAN_VERSION This is not necessary, but it'll be easier to parse in userspace, also given that other places like act_tunnel_key, cls_flower and ip_tunnel_core are also doing so. Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 6538895466e0..b3a9b10ff43d 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -479,6 +479,9 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, htonl(opts->u.vxlan.gbp))) return -1; } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION, + htonl(opts->u.erspan.version))) + return -1; switch (opts->u.erspan.version) { case ERSPAN_VERSION: if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, -- cgit v1.2.3 From 73239bd9707ab2f3b7621e50468c14410cf4e2c2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 13 Dec 2019 16:53:08 +0800 Subject: netfilter: nft_tunnel: also dump OPTS_ERSPAN/VXLAN This patch is to add the nest attr OPTS_ERSPAN/VXLAN when dumping KEY_OPTS, and it would be helpful when parsing in userpace. Also, this is needed for supporting multiple geneve opts in the future patches. Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index b3a9b10ff43d..eb1740236526 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -468,17 +468,24 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct nft_tunnel_obj *priv) { struct nft_tunnel_opts *opts = &priv->opts; - struct nlattr *nest; + struct nlattr *nest, *inner; nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS); if (!nest) return -1; if (opts->flags & TUNNEL_VXLAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); + if (!inner) + return -1; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP, htonl(opts->u.vxlan.gbp))) return -1; + nla_nest_end(skb, inner); } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); + if (!inner) + return -1; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION, htonl(opts->u.erspan.version))) return -1; @@ -496,6 +503,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, return -1; break; } + nla_nest_end(skb, inner); } nla_nest_end(skb, nest); -- cgit v1.2.3 From 7e03998429ee0a27c13b8690c11f9cf40f67f6e9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 13 Dec 2019 16:53:09 +0800 Subject: netfilter: nft_tunnel: add the missing nla_nest_cancel() When nla_put_xxx() fails under nla_nest_start_noflag(), nla_nest_cancel() should be called, so that the skb can be trimmed properly. Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index eb1740236526..23cd163689d5 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -443,10 +443,15 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 || - nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 || - nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label)) + if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, + &info->key.u.ipv6.src) < 0 || + nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, + &info->key.u.ipv6.dst) < 0 || + nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, + info->key.label)) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } else { @@ -454,9 +459,13 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) if (!nest) return -1; - if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 || - nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0) + if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, + info->key.u.ipv4.src) < 0 || + nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, + info->key.u.ipv4.dst) < 0) { + nla_nest_cancel(skb, nest); return -1; + } nla_nest_end(skb, nest); } @@ -477,37 +486,42 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, if (opts->flags & TUNNEL_VXLAN_OPT) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); if (!inner) - return -1; + goto failure; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP, htonl(opts->u.vxlan.gbp))) - return -1; + goto inner_failure; nla_nest_end(skb, inner); } else if (opts->flags & TUNNEL_ERSPAN_OPT) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); if (!inner) - return -1; + goto failure; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION, htonl(opts->u.erspan.version))) - return -1; + goto inner_failure; switch (opts->u.erspan.version) { case ERSPAN_VERSION: if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX, opts->u.erspan.u.index)) - return -1; + goto inner_failure; break; case ERSPAN_VERSION2: if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID, get_hwid(&opts->u.erspan.u.md2)) || nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR, opts->u.erspan.u.md2.dir)) - return -1; + goto inner_failure; break; } nla_nest_end(skb, inner); } nla_nest_end(skb, nest); - return 0; + +inner_failure: + nla_nest_cancel(skb, inner); +failure: + nla_nest_cancel(skb, nest); + return -1; } static int nft_tunnel_ports_dump(struct sk_buff *skb, -- cgit v1.2.3 From 13d74c0a9708a4f1ab0164a800ce9ea3de32f47b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 13 Dec 2019 01:58:15 +0100 Subject: netfilter: conntrack: remove two export symbols Not used anywhere, remove them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 1 - net/netfilter/nf_conntrack_extend.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0af1898af2b8..983a9481e8f8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2333,7 +2333,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) return nf_conntrack_hash_resize(hashsize); } -EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); static __always_inline unsigned int total_extension_size(void) { diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index c24e5b64b00c..3dbe2329c3f1 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -37,7 +37,6 @@ void nf_ct_ext_destroy(struct nf_conn *ct) kfree(ct->ext); } -EXPORT_SYMBOL(nf_ct_ext_destroy); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { -- cgit v1.2.3 From b7ffa045e7000d5410bf206454e0cb8de0428ed5 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 16 Dec 2019 19:21:02 +0100 Subject: tipc: don't send gap blocks in ACK messages In the commit referred to below we eliminated sending of the 'gap' indicator in regular ACK messages, reserving this to explicit NACK ditto. Unfortunately we missed to also eliminate building of the 'gap block' area in ACK messages. This area is meant to report gaps in the received packet sequence following the initial gap, so that lost packets can be retransmitted earlier and received out-of-sequence packets can be released earlier. However, the interpretation of those blocks is dependent on a complete and correct sequence of gaps and acks. Hence, when the initial gap indicator is missing a single gap block will be interpreted as an acknowledgment of all preceding packets. This may lead to packets being released prematurely from the sender's transmit queue, with easily predicatble consequences. We now fix this by not building any gap block area if there is no initial gap to report. Fixes: commit 02288248b051 ("tipc: eliminate gap indicator from ACK messages") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 94dd48cd70a3..467c53a1fb5c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -250,7 +250,7 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static int tipc_link_release_pkts(struct tipc_link *l, u16 to); -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap); static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq); @@ -1423,14 +1423,14 @@ static int tipc_link_release_pkts(struct tipc_link *l, u16 acked) * * returns the actual allocated memory size */ -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data) +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) { struct sk_buff *skb = skb_peek(&l->deferdq); struct tipc_gap_ack_blks *ga = data; u16 len, expect, seqno = 0; u8 n = 0; - if (!skb) + if (!skb || !gap) goto exit; expect = buf_seqno(skb); @@ -1739,7 +1739,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) - glen = tipc_build_gap_ack_blks(l, data); + glen = tipc_build_gap_ack_blks(l, data, rcvgap); tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + glen + dlen); skb_trim(skb, INT_H_SIZE + glen + dlen); -- cgit v1.2.3 From 8ae674964e67eb7deb3f0e489bfe9c102a7990b0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 16 Dec 2019 10:32:47 -0800 Subject: net: dsa: Make PHYLINK related function static again Commit 77373d49de22 ("net: dsa: Move the phylink driver calls into port.c") moved and exported a bunch of symbols, but they are not used outside of net/dsa/port.c at the moment, so no reason to export them. Reported-by: Russell King Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Acked-by: Russell King Acked-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 16 ---------------- net/dsa/port.c | 38 ++++++++++++++++---------------------- 2 files changed, 16 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 2dd86d9bcda9..09ea2fd78c74 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -150,22 +150,6 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); -void dsa_port_phylink_validate(struct phylink_config *config, - unsigned long *supported, - struct phylink_link_state *state); -void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state); -void dsa_port_phylink_mac_config(struct phylink_config *config, - unsigned int mode, - const struct phylink_link_state *state); -void dsa_port_phylink_mac_an_restart(struct phylink_config *config); -void dsa_port_phylink_mac_link_down(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface); -void dsa_port_phylink_mac_link_up(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; /* slave.c */ diff --git a/net/dsa/port.c b/net/dsa/port.c index 46ac9ba21987..ffb5601f7ed6 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -415,9 +415,9 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return phydev; } -void dsa_port_phylink_validate(struct phylink_config *config, - unsigned long *supported, - struct phylink_link_state *state) +static void dsa_port_phylink_validate(struct phylink_config *config, + unsigned long *supported, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -427,10 +427,9 @@ void dsa_port_phylink_validate(struct phylink_config *config, ds->ops->phylink_validate(ds, dp->index, supported, state); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_validate); -void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state) +static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -444,11 +443,10 @@ void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0) state->link = 0; } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_pcs_get_state); -void dsa_port_phylink_mac_config(struct phylink_config *config, - unsigned int mode, - const struct phylink_link_state *state) +static void dsa_port_phylink_mac_config(struct phylink_config *config, + unsigned int mode, + const struct phylink_link_state *state) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -458,9 +456,8 @@ void dsa_port_phylink_mac_config(struct phylink_config *config, ds->ops->phylink_mac_config(ds, dp->index, mode, state); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config); -void dsa_port_phylink_mac_an_restart(struct phylink_config *config) +static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -470,11 +467,10 @@ void dsa_port_phylink_mac_an_restart(struct phylink_config *config) ds->ops->phylink_mac_an_restart(ds, dp->index); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart); -void dsa_port_phylink_mac_link_down(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface) +static void dsa_port_phylink_mac_link_down(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct phy_device *phydev = NULL; @@ -491,12 +487,11 @@ void dsa_port_phylink_mac_link_down(struct phylink_config *config, ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down); -void dsa_port_phylink_mac_link_up(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev) +static void dsa_port_phylink_mac_link_up(struct phylink_config *config, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; @@ -509,7 +504,6 @@ void dsa_port_phylink_mac_link_up(struct phylink_config *config, ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); } -EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up); const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, -- cgit v1.2.3 From cbd22f172df782d6b2150044d0ff2051bad2a00f Mon Sep 17 00:00:00 2001 From: Kevin 'ldir' Darbyshire-Bryant Date: Wed, 18 Dec 2019 14:05:13 +0000 Subject: sch_cake: drop unused variable tin_quantum_prio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns out tin_quantum_prio isn't used anymore and is a leftover from a previous implementation of diffserv tins. Since the variable isn't used in any calculations it can be eliminated. Drop variable and places where it was set. Rename remaining variable and consolidate naming of intermediate variables that set it. Signed-off-by: Kevin Darbyshire-Bryant Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 59 ++++++++++++++++------------------------------------ 1 file changed, 18 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index e0f40400f679..6cc3ab145513 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -173,8 +173,7 @@ struct cake_tin_data { u64 tin_rate_bps; u16 tin_rate_shft; - u16 tin_quantum_prio; - u16 tin_quantum_band; + u16 tin_quantum; s32 tin_deficit; u32 tin_backlog; u32 tin_dropped; @@ -1919,7 +1918,7 @@ begin: while (b->tin_deficit < 0 || !(b->sparse_flow_count + b->bulk_flow_count)) { if (b->tin_deficit <= 0) - b->tin_deficit += b->tin_quantum_band; + b->tin_deficit += b->tin_quantum; if (b->sparse_flow_count + b->bulk_flow_count) empty = false; @@ -2241,8 +2240,7 @@ static int cake_config_besteffort(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_band = 65535; - b->tin_quantum_prio = 65535; + b->tin_quantum = 65535; return 0; } @@ -2253,8 +2251,7 @@ static int cake_config_precedence(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2267,18 +2264,14 @@ static int cake_config_precedence(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2347,8 +2340,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; - u32 quantum1 = 256; - u32 quantum2 = 256; + u32 quantum = 256; u32 i; q->tin_cnt = 8; @@ -2364,18 +2356,14 @@ static int cake_config_diffserv8(struct Qdisc *sch) cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - b->tin_quantum_prio = max_t(u16, 1U, quantum1); - b->tin_quantum_band = max_t(u16, 1U, quantum2); + b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; - quantum1 *= 3; - quantum1 >>= 1; - - quantum2 *= 7; - quantum2 >>= 3; + quantum *= 7; + quantum >>= 3; } return 0; @@ -2414,17 +2402,11 @@ static int cake_config_diffserv4(struct Qdisc *sch) cake_set_rate(&q->tins[3], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 2; - q->tins[3].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 1; - q->tins[3].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 1; + q->tins[3].tin_quantum = quantum >> 2; return 0; } @@ -2455,15 +2437,10 @@ static int cake_config_diffserv3(struct Qdisc *sch) cake_set_rate(&q->tins[2], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); - /* priority weights */ - q->tins[0].tin_quantum_prio = quantum; - q->tins[1].tin_quantum_prio = quantum >> 4; - q->tins[2].tin_quantum_prio = quantum << 4; - /* bandwidth-sharing weights */ - q->tins[0].tin_quantum_band = quantum; - q->tins[1].tin_quantum_band = quantum >> 4; - q->tins[2].tin_quantum_band = quantum >> 2; + q->tins[0].tin_quantum = quantum; + q->tins[1].tin_quantum = quantum >> 4; + q->tins[2].tin_quantum = quantum >> 2; return 0; } -- cgit v1.2.3 From dcc68b4d8084e1ac9af0d4022d6b1aff6a139a33 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2019 14:55:13 +0000 Subject: net: sch_ets: Add a new Qdisc Introduces a new Qdisc, which is based on 802.1Q-2014 wording. It is PRIO-like in how it is configured, meaning one needs to specify how many bands there are, how many are strict and how many are dwrr, quanta for the latter, and priomap. The new Qdisc operates like the PRIO / DRR combo would when configured as per the standard. The strict classes, if any, are tried for traffic first. When there's no traffic in any of the strict queues, the ETS ones (if any) are treated in the same way as in DRR. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 17 + net/sched/Kconfig | 17 + net/sched/Makefile | 1 + net/sched/sch_ets.c | 733 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 768 insertions(+) create mode 100644 net/sched/sch_ets.c (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 9f1a72876212..bf5a5b1dfb0b 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1187,4 +1187,21 @@ enum { #define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) +/* ETS */ + +#define TCQ_ETS_MAX_BANDS 16 + +enum { + TCA_ETS_UNSPEC, + TCA_ETS_NBANDS, /* u8 */ + TCA_ETS_NSTRICT, /* u8 */ + TCA_ETS_QUANTA, /* nested TCA_ETS_QUANTA_BAND */ + TCA_ETS_QUANTA_BAND, /* u32 */ + TCA_ETS_PRIOMAP, /* nested TCA_ETS_PRIOMAP_BAND */ + TCA_ETS_PRIOMAP_BAND, /* u8 */ + __TCA_ETS_MAX, +}; + +#define TCA_ETS_MAX (__TCA_ETS_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2985509147a2..b1e7ec726958 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -409,6 +409,23 @@ config NET_SCH_PLUG To compile this code as a module, choose M here: the module will be called sch_plug. +config NET_SCH_ETS + tristate "Enhanced transmission selection scheduler (ETS)" + help + The Enhanced Transmission Selection scheduler is a classful + queuing discipline that merges functionality of PRIO and DRR + qdiscs in one scheduler. ETS makes it easy to configure a set of + strict and bandwidth-sharing bands to implement the transmission + selection described in 802.1Qaz. + + Say Y here if you want to use the ETS packet scheduling + algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_ets. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 415d1e1f237e..bc8856b865ff 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o +obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c new file mode 100644 index 000000000000..e6194b23e9b0 --- /dev/null +++ b/net/sched/sch_ets.c @@ -0,0 +1,733 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * net/sched/sch_ets.c Enhanced Transmission Selection scheduler + * + * Description + * ----------- + * + * The Enhanced Transmission Selection scheduler is a classful queuing + * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler. + * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to + * implement the transmission selection described in 802.1Qaz. + * + * Although ETS is technically classful, it's not possible to add and remove + * classes at will. Instead one specifies number of classes, how many are + * PRIO-like and how many DRR-like, and quanta for the latter. + * + * Algorithm + * --------- + * + * The strict classes, if any, are tried for traffic first: first band 0, if it + * has no traffic then band 1, etc. + * + * When there is no traffic in any of the strict queues, the bandwidth-sharing + * ones are tried next. Each band is assigned a deficit counter, initialized to + * "quantum" of that band. ETS maintains a list of active bandwidth-sharing + * bands whose qdiscs are non-empty. A packet is dequeued from the band at the + * head of the list if the packet size is smaller or equal to the deficit + * counter. If the counter is too small, it is increased by "quantum" and the + * scheduler moves on to the next band in the active list. + */ + +#include +#include +#include +#include +#include +#include + +struct ets_class { + struct list_head alist; /* In struct ets_sched.active. */ + struct Qdisc *qdisc; + u32 quantum; + u32 deficit; + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; +}; + +struct ets_sched { + struct list_head active; + struct tcf_proto __rcu *filter_list; + struct tcf_block *block; + unsigned int nbands; + unsigned int nstrict; + u8 prio2band[TC_PRIO_MAX + 1]; + struct ets_class classes[TCQ_ETS_MAX_BANDS]; +}; + +static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_NBANDS] = { .type = NLA_U8 }, + [TCA_ETS_NSTRICT] = { .type = NLA_U8 }, + [TCA_ETS_QUANTA] = { .type = NLA_NESTED }, + [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 }, +}; + +static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { + [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, +}; + +static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, + unsigned int *quantum, + struct netlink_ext_ack *extack) +{ + *quantum = nla_get_u32(attr); + if (!*quantum) { + NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero"); + return -EINVAL; + } + return 0; +} + +static struct ets_class * +ets_class_from_arg(struct Qdisc *sch, unsigned long arg) +{ + struct ets_sched *q = qdisc_priv(sch); + + return &q->classes[arg - 1]; +} + +static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl) +{ + struct ets_sched *q = qdisc_priv(sch); + int band = cl - q->classes; + + return TC_H_MAKE(sch->handle, band + 1); +} + +static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl) +{ + unsigned int band = cl - q->classes; + + return band < q->nstrict; +} + +static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, *arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int quantum; + int err; + + /* Classes can be added and removed only through Qdisc_ops.change + * interface. + */ + if (!cl) { + NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported"); + return -EOPNOTSUPP; + } + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_QUANTA_BAND]) + /* Nothing to configure. */ + return 0; + + if (ets_class_is_strict(q, cl)) { + NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum"); + return -EINVAL; + } + + err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum, + extack); + if (err) + return err; + + sch_tree_lock(sch); + cl->quantum = quantum; + sch_tree_unlock(sch); + return 0; +} + +static int ets_class_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, cl), NULL); + if (!new) + new = &noop_qdisc; + else + qdisc_hash_add(new, true); + } + + *old = qdisc_replace(sch, new, &cl->qdisc); + return 0; +} + +static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + + return cl->qdisc; +} + +static unsigned long ets_class_find(struct Qdisc *sch, u32 classid) +{ + unsigned long band = TC_H_MIN(classid); + struct ets_sched *q = qdisc_priv(sch); + + if (band - 1 >= q->nbands) + return 0; + return band; +} + +static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + + /* We get notified about zero-length child Qdiscs as well if they are + * offloaded. Those aren't on the active list though, so don't attempt + * to remove them. + */ + if (!ets_class_is_strict(q, cl) && sch->q.qlen) + list_del(&cl->alist); +} + +static int ets_class_dump(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = ets_class_id(sch, cl); + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + if (!ets_class_is_strict(q, cl)) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum)) + goto nla_put_failure; + } + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct Qdisc *cl_q = cl->qdisc; + + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + + return 0; +} + +static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct ets_sched *q = qdisc_priv(sch); + int i; + + if (arg->stop) + return; + + for (i = 0; i < q->nbands; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_block * +ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + + if (cl) { + NL_SET_ERR_MSG(extack, "ETS classid must be zero"); + return NULL; + } + + return q->block; +} + +static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return ets_class_find(sch, classid); +} + +static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ +} + +static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct ets_sched *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + struct tcf_proto *fl; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + if (TC_H_MAJ(skb->priority) != sch->handle) { + fl = rcu_dereference_bh(q->filter_list); + err = tcf_classify(skb, fl, &res, false); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return NULL; + } +#endif + if (!fl || err < 0) { + if (TC_H_MAJ(band)) + band = 0; + return &q->classes[q->prio2band[band & TC_PRIO_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band >= q->nbands) + return &q->classes[q->prio2band[0]]; + return &q->classes[band]; +} + +static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + unsigned int len = qdisc_pkt_len(skb); + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + int err = 0; + bool first; + + cl = ets_classify(skb, sch, &err); + if (!cl) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + first = !cl->qdisc->q.qlen; + err = qdisc_enqueue(skb, cl->qdisc, to_free); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + qdisc_qstats_drop(sch); + } + return err; + } + + if (first && !ets_class_is_strict(q, cl)) { + list_add_tail(&cl->alist, &q->active); + cl->deficit = cl->quantum; + } + + sch->qstats.backlog += len; + sch->q.qlen++; + return err; +} + +static struct sk_buff * +ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb) +{ + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + return skb; +} + +static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + struct ets_class *cl; + struct sk_buff *skb; + unsigned int band; + unsigned int len; + + while (1) { + for (band = 0; band < q->nstrict; band++) { + cl = &q->classes[band]; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (skb) + return ets_qdisc_dequeue_skb(sch, skb); + } + + if (list_empty(&q->active)) + goto out; + + cl = list_first_entry(&q->active, struct ets_class, alist); + skb = cl->qdisc->ops->peek(cl->qdisc); + if (!skb) { + qdisc_warn_nonwc(__func__, cl->qdisc); + goto out; + } + + len = qdisc_pkt_len(skb); + if (len <= cl->deficit) { + cl->deficit -= len; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (unlikely(!skb)) + goto out; + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + return ets_qdisc_dequeue_skb(sch, skb); + } + + cl->deficit += cl->quantum; + list_move_tail(&cl->alist, &q->active); + } +out: + return NULL; +} + +static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr, + unsigned int nbands, u8 *priomap, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int prio = 0; + u8 band; + int rem; + int err; + + err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX, + ets_priomap_policy, NL_VALIDATE_STRICT, + extack); + if (err) + return err; + + nla_for_each_nested(attr, priomap_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_PRIOMAP_BAND: + if (prio > TC_PRIO_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap"); + return -EINVAL; + } + band = nla_get_u8(attr); + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap"); + return -EINVAL; + } + priomap[prio++] = band; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr, + unsigned int nbands, unsigned int nstrict, + unsigned int *quanta, + struct netlink_ext_ack *extack) +{ + const struct nlattr *attr; + int band = nstrict; + int rem; + int err; + + err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX, + ets_quanta_policy, NL_VALIDATE_STRICT, + extack); + if (err < 0) + return err; + + nla_for_each_nested(attr, quanta_attr, rem) { + switch (nla_type(attr)) { + case TCA_ETS_QUANTA_BAND: + if (band >= nbands) { + NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands"); + return -EINVAL; + } + err = ets_quantum_parse(sch, attr, &quanta[band++], + extack); + if (err) + return err; + break; + default: + WARN_ON_ONCE(1); /* Validate should have caught this. */ + return -EINVAL; + } + } + + return 0; +} + +static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0}; + struct Qdisc *queues[TCQ_ETS_MAX_BANDS]; + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *tb[TCA_ETS_MAX + 1]; + unsigned int oldbands = q->nbands; + u8 priomap[TC_PRIO_MAX + 1]; + unsigned int nstrict = 0; + unsigned int nbands; + unsigned int i; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETS_NBANDS]) { + NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument"); + return -EINVAL; + } + nbands = nla_get_u8(tb[TCA_ETS_NBANDS]); + if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands"); + return -EINVAL; + } + /* Unless overridden, traffic goes to the last band. */ + memset(priomap, nbands - 1, sizeof(priomap)); + + if (tb[TCA_ETS_NSTRICT]) { + nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]); + if (nstrict > nbands) { + NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands"); + return -EINVAL; + } + } + + if (tb[TCA_ETS_PRIOMAP]) { + err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP], + nbands, priomap, extack); + if (err) + return err; + } + + if (tb[TCA_ETS_QUANTA]) { + err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA], + nbands, nstrict, quanta, extack); + if (err) + return err; + } + /* If there are more bands than strict + quanta provided, the remaining + * ones are ETS with quantum of MTU. Initialize the missing values here. + */ + for (i = nstrict; i < nbands; i++) { + if (!quanta[i]) + quanta[i] = psched_mtu(qdisc_dev(sch)); + } + + /* Before commit, make sure we can allocate all new qdiscs */ + for (i = oldbands; i < nbands; i++) { + queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + ets_class_id(sch, &q->classes[i]), + extack); + if (!queues[i]) { + while (i > oldbands) + qdisc_put(queues[--i]); + return -ENOMEM; + } + } + + sch_tree_lock(sch); + + q->nbands = nbands; + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); + + for (i = q->nbands; i < oldbands; i++) + qdisc_tree_flush_backlog(q->classes[i].qdisc); + + for (i = 0; i < q->nbands; i++) + q->classes[i].quantum = quanta[i]; + + for (i = oldbands; i < q->nbands; i++) { + q->classes[i].qdisc = queues[i]; + if (q->classes[i].qdisc != &noop_qdisc) + qdisc_hash_add(q->classes[i].qdisc, true); + } + + sch_tree_unlock(sch); + + for (i = q->nbands; i < oldbands; i++) { + qdisc_put(q->classes[i].qdisc); + memset(&q->classes[i], 0, sizeof(q->classes[i])); + } + return 0; +} + +static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct ets_sched *q = qdisc_priv(sch); + int err; + + if (!opt) + return -EINVAL; + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + INIT_LIST_HEAD(&q->active); + return ets_qdisc_change(sch, opt, extack); +} + +static void ets_qdisc_reset(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + for (band = q->nstrict; band < q->nbands; band++) { + if (q->classes[band].qdisc->q.qlen) + list_del(&q->classes[band].alist); + } + for (band = 0; band < q->nbands; band++) + qdisc_reset(q->classes[band].qdisc); + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + +static void ets_qdisc_destroy(struct Qdisc *sch) +{ + struct ets_sched *q = qdisc_priv(sch); + int band; + + tcf_block_put(q->block); + for (band = 0; band < q->nbands; band++) + qdisc_put(q->classes[band].qdisc); +} + +static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct ets_sched *q = qdisc_priv(sch); + struct nlattr *opts; + struct nlattr *nest; + int band; + int prio; + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_err; + + if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) + goto nla_err; + + if (q->nstrict && + nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) + goto nla_err; + + if (q->nbands > q->nstrict) { + nest = nla_nest_start(skb, TCA_ETS_QUANTA); + if (!nest) + goto nla_err; + + for (band = q->nstrict; band < q->nbands; band++) { + if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, + q->classes[band].quantum)) + goto nla_err; + } + + nla_nest_end(skb, nest); + } + + nest = nla_nest_start(skb, TCA_ETS_PRIOMAP); + if (!nest) + goto nla_err; + + for (prio = 0; prio <= TC_PRIO_MAX; prio++) { + if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) + goto nla_err; + } + + nla_nest_end(skb, nest); + + return nla_nest_end(skb, opts); + +nla_err: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static const struct Qdisc_class_ops ets_class_ops = { + .change = ets_class_change, + .graft = ets_class_graft, + .leaf = ets_class_leaf, + .find = ets_class_find, + .qlen_notify = ets_class_qlen_notify, + .dump = ets_class_dump, + .dump_stats = ets_class_dump_stats, + .walk = ets_qdisc_walk, + .tcf_block = ets_qdisc_tcf_block, + .bind_tcf = ets_qdisc_bind_tcf, + .unbind_tcf = ets_qdisc_unbind_tcf, +}; + +static struct Qdisc_ops ets_qdisc_ops __read_mostly = { + .cl_ops = &ets_class_ops, + .id = "ets", + .priv_size = sizeof(struct ets_sched), + .enqueue = ets_qdisc_enqueue, + .dequeue = ets_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .change = ets_qdisc_change, + .init = ets_qdisc_init, + .reset = ets_qdisc_reset, + .destroy = ets_qdisc_destroy, + .dump = ets_qdisc_dump, + .owner = THIS_MODULE, +}; + +static int __init ets_init(void) +{ + return register_qdisc(&ets_qdisc_ops); +} + +static void __exit ets_exit(void) +{ + unregister_qdisc(&ets_qdisc_ops); +} + +module_init(ets_init); +module_exit(ets_exit); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d35eb52bd2ac7557b62bda52668f2e64dde2cf90 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2019 14:55:15 +0000 Subject: net: sch_ets: Make the ETS qdisc offloadable Add hooks at appropriate points to make it possible to offload the ETS Qdisc. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 31 ++++++++++++++++ net/sched/sch_ets.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 30745068fb39..7a8ed11f5d45 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -849,6 +849,7 @@ enum tc_setup_type { TC_SETUP_QDISC_GRED, TC_SETUP_QDISC_TAPRIO, TC_SETUP_FT, + TC_SETUP_QDISC_ETS, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a7c5d492bc04..47b115e2012a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -823,4 +823,35 @@ struct tc_root_qopt_offload { bool ingress; }; +enum tc_ets_command { + TC_ETS_REPLACE, + TC_ETS_DESTROY, + TC_ETS_STATS, + TC_ETS_GRAFT, +}; + +struct tc_ets_qopt_offload_replace_params { + unsigned int bands; + u8 priomap[TC_PRIO_MAX + 1]; + unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ + unsigned int weights[TCQ_ETS_MAX_BANDS]; + struct gnet_stats_queue *qstats; +}; + +struct tc_ets_qopt_offload_graft_params { + u8 band; + u32 child_handle; +}; + +struct tc_ets_qopt_offload { + enum tc_ets_command command; + u32 handle; + u32 parent; + union { + struct tc_ets_qopt_offload_replace_params replace_params; + struct tc_qopt_offload_stats stats; + struct tc_ets_qopt_offload_graft_params graft_params; + }; +}; + #endif diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index e6194b23e9b0..a87e9159338c 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -102,6 +102,91 @@ static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl) return TC_H_MAKE(sch->handle, band + 1); } +static void ets_offload_change(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct ets_sched *q = qdisc_priv(sch); + struct tc_ets_qopt_offload qopt; + unsigned int w_psum_prev = 0; + unsigned int q_psum = 0; + unsigned int q_sum = 0; + unsigned int quantum; + unsigned int w_psum; + unsigned int weight; + unsigned int i; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.bands = q->nbands; + qopt.replace_params.qstats = &sch->qstats; + memcpy(&qopt.replace_params.priomap, + q->prio2band, sizeof(q->prio2band)); + + for (i = 0; i < q->nbands; i++) + q_sum += q->classes[i].quantum; + + for (i = 0; i < q->nbands; i++) { + quantum = q->classes[i].quantum; + q_psum += quantum; + w_psum = quantum ? q_psum * 100 / q_sum : 0; + weight = w_psum - w_psum_prev; + w_psum_prev = w_psum; + + qopt.replace_params.quanta[i] = quantum; + qopt.replace_params.weights[i] = weight; + } + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_ETS_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); +} + +static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc *old, unsigned long arg, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_GRAFT; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.graft_params.band = arg - 1; + qopt.graft_params.child_handle = new->handle; + + qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS, + &qopt, extack); +} + +static int ets_offload_dump(struct Qdisc *sch) +{ + struct tc_ets_qopt_offload qopt; + + qopt.command = TC_ETS_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt); +} + static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl) { unsigned int band = cl - q->classes; @@ -154,6 +239,8 @@ static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid, sch_tree_lock(sch); cl->quantum = quantum; sch_tree_unlock(sch); + + ets_offload_change(sch); return 0; } @@ -173,6 +260,7 @@ static int ets_class_graft(struct Qdisc *sch, unsigned long arg, } *old = qdisc_replace(sch, new, &cl->qdisc); + ets_offload_graft(sch, new, *old, arg, extack); return 0; } @@ -589,6 +677,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_unlock(sch); + ets_offload_change(sch); for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); memset(&q->classes[i], 0, sizeof(q->classes[i])); @@ -633,6 +722,7 @@ static void ets_qdisc_destroy(struct Qdisc *sch) struct ets_sched *q = qdisc_priv(sch); int band; + ets_offload_destroy(sch); tcf_block_put(q->block); for (band = 0; band < q->nbands; band++) qdisc_put(q->classes[band].qdisc); @@ -645,6 +735,11 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest; int band; int prio; + int err; + + err = ets_offload_dump(sch); + if (err) + return err; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) -- cgit v1.2.3 From 6de6c1f840c051017f2308503858ff19344c56b3 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Wed, 18 Dec 2019 12:57:47 -0800 Subject: bpf: Allow to change skb mark in test_run allow to pass skb's mark field into bpf_prog_test_run ctx for BPF_PROG_TYPE_SCHED_CLS prog type. that would allow to test bpf programs which are doing decision based on this field Signed-off-by: Nikita V. Shirokov Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 10 +++++++++- tools/testing/selftests/bpf/prog_tests/skb_ctx.c | 5 +++++ tools/testing/selftests/bpf/progs/test_skb_ctx.c | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 93a9c87787e0..d555c0d8657d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -251,7 +251,13 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return 0; /* make sure the fields we don't use are zeroed */ - if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) + return -EINVAL; + + /* mark is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), + offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ @@ -274,6 +280,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) sizeof(struct __sk_buff))) return -EINVAL; + skb->mark = __skb->mark; skb->priority = __skb->priority; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); @@ -301,6 +308,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) if (!__skb) return; + __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index edf5e8c7d400..c6d6b685a946 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -13,6 +13,7 @@ void test_skb_ctx(void) .tstamp = 7, .wire_len = 100, .gso_segs = 8, + .mark = 9, }; struct bpf_prog_test_run_attr tattr = { .data_in = &pkt_v4, @@ -93,4 +94,8 @@ void test_skb_ctx(void) "ctx_out_tstamp", "skb->tstamp == %lld, expected %d\n", skb.tstamp, 8); + CHECK_ATTR(skb.mark != 10, + "ctx_out_mark", + "skb->mark == %u, expected %d\n", + skb.mark, 10); } diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index 534fbf9a7344..e18da87fe84f 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -17,6 +17,7 @@ int process(struct __sk_buff *skb) } skb->priority++; skb->tstamp++; + skb->mark++; if (skb->wire_len != 100) return 1; -- cgit v1.2.3 From 8d5a49e9e31ba1ddd34a54b2351d068a90c78707 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2019 14:12:01 -0800 Subject: net/tls: add helper for testing if socket is RX offloaded There is currently no way for driver to reliably check that the socket it has looked up is in fact RX offloaded. Add a helper. This allows drivers to catch misbehaving firmware. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tls.h | 9 +++++++++ net/tls/tls_device.c | 5 +++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index df630f5fc723..bf9eb4823933 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -641,6 +641,7 @@ int tls_sw_fallback_init(struct sock *sk, #ifdef CONFIG_TLS_DEVICE void tls_device_init(void); void tls_device_cleanup(void); +void tls_device_sk_destruct(struct sock *sk); int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); void tls_device_free_resources_tx(struct sock *sk); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); @@ -649,6 +650,14 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, struct sk_buff *skb, struct strp_msg *rxm); + +static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk) +{ + if (!sk_fullsock(sk) || + smp_load_acquire(&sk->sk_destruct) != tls_device_sk_destruct) + return false; + return tls_get_ctx(sk)->rx_conf == TLS_HW; +} #else static inline void tls_device_init(void) {} static inline void tls_device_cleanup(void) {} diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index cd91ad812291..1ba5a92832bb 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -178,7 +178,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ -static void tls_device_sk_destruct(struct sock *sk) +void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); @@ -196,6 +196,7 @@ static void tls_device_sk_destruct(struct sock *sk) if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); } +EXPORT_SYMBOL_GPL(tls_device_sk_destruct); void tls_device_free_resources_tx(struct sock *sk) { @@ -903,7 +904,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk, spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_device_sk_destruct; + smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } -- cgit v1.2.3 From e312b9e706ed6d94f6cc9088fcd9fbd81de4525c Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:02 +0100 Subject: xsk: Make xskmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all xskmaps, which simplifies __xsk_map_flush() and xsk_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-5-bjorn.topel@gmail.com --- include/net/xdp_sock.h | 11 ++++------- kernel/bpf/xskmap.c | 18 +++--------------- net/core/filter.c | 9 ++++----- net/xdp/xsk.c | 17 +++++++++-------- 4 files changed, 20 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e3780e4b74e1..48594740d67c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -72,7 +72,6 @@ struct xdp_umem { struct xsk_map { struct bpf_map map; - struct list_head __percpu *flush_list; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; @@ -139,9 +138,8 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs); -void __xsk_map_flush(struct bpf_map *map); +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); +void __xsk_map_flush(void); static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) @@ -369,13 +367,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } -static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) +static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; } -static inline void __xsk_map_flush(struct bpf_map *map) +static inline void __xsk_map_flush(void) { } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 90c4fce1c981..2cc5c8f4c800 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct bpf_map_memory mem; - int cpu, err, numa_node; + int err, numa_node; struct xsk_map *m; - u64 cost, size; + u64 size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, size); if (err < 0) return ERR_PTR(err); @@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) { - bpf_map_charge_finish(&m->map.memory); - bpf_map_area_free(m); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - return &m->map; } @@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); - free_percpu(m->flush_list); bpf_map_area_free(m); } diff --git a/net/core/filter.c b/net/core/filter.c index a411f7835dee..c51678c473c5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3511,8 +3511,7 @@ err: static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) + struct xdp_buff *xdp) { int err; @@ -3537,7 +3536,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, case BPF_MAP_TYPE_XSKMAP: { struct xdp_sock *xs = fwd; - err = __xsk_map_redirect(map, xdp, xs); + err = __xsk_map_redirect(xs, xdp); return err; } default: @@ -3562,7 +3561,7 @@ void xdp_do_flush_map(void) __cpu_map_flush(map); break; case BPF_MAP_TYPE_XSKMAP: - __xsk_map_flush(map); + __xsk_map_flush(); break; default: break; @@ -3619,7 +3618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); - err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); if (unlikely(err)) goto err; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 956793893c9d..e45c27f5cfca 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,8 @@ #define TX_BATCH_SIZE 16 +static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); + bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && @@ -264,11 +266,9 @@ out_unlock: return err; } -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) +int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; err = xsk_rcv(xs, xdp); @@ -281,10 +281,9 @@ int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, return 0; } -void __xsk_map_flush(struct bpf_map *map) +void __xsk_map_flush(void) { - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { @@ -1177,7 +1176,7 @@ static struct pernet_operations xsk_net_ops = { static int __init xsk_init(void) { - int err; + int err, cpu; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) @@ -1195,6 +1194,8 @@ static int __init xsk_init(void) if (err) goto out_pernet; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); return 0; out_pernet: -- cgit v1.2.3 From 96360004b8628541f5d05a845ea213267db0b1a2 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:03 +0100 Subject: xdp: Make devmap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The devmap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __dev_map_flush() and dev_map_init_map(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-6-bjorn.topel@gmail.com --- include/linux/bpf.h | 4 ++-- kernel/bpf/devmap.c | 35 +++++++++++++---------------------- net/core/filter.c | 2 +- 3 files changed, 16 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d467983e61bb..31191804ca09 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -959,7 +959,7 @@ struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_flush(struct bpf_map *map); +void __dev_map_flush(void); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -1068,7 +1068,7 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map return NULL; } -static inline void __dev_map_flush(struct bpf_map *map) +static inline void __dev_map_flush(void) { } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b7595de6a91a..da9c832fc5c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,7 +75,6 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ - struct list_head __percpu *flush_list; struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,6 +84,7 @@ struct bpf_dtab { u32 n_buckets; }; +static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -109,8 +109,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - int err, cpu; - u64 cost; + u64 cost = 0; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -125,9 +125,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); - /* make sure page count doesn't overflow */ - cost = (u64) sizeof(struct list_head) * num_possible_cpus(); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -143,17 +140,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (err) return -EINVAL; - dtab->flush_list = alloc_percpu(struct list_head); - if (!dtab->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_percpu; + goto free_charge; spin_lock_init(&dtab->index_lock); } else { @@ -161,13 +151,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_percpu; + goto free_charge; } return 0; -free_percpu: - free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); return -ENOMEM; @@ -254,7 +242,6 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - free_percpu(dtab->flush_list); kfree(dtab); } @@ -384,10 +371,9 @@ error: * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(struct bpf_map *map) +void __dev_map_flush(void) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -419,7 +405,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -777,10 +763,15 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + int cpu; + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index c51678c473c5..b7570cb84902 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3555,7 +3555,7 @@ void xdp_do_flush_map(void) switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: - __dev_map_flush(map); + __dev_map_flush(); break; case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); -- cgit v1.2.3 From cdfafe98cabefeedbbc65af5c191c59745c03298 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:04 +0100 Subject: xdp: Make cpumap flush_list common for all map instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpumap flush list is used to track entries that need to flushed from via the xdp_do_flush_map() function. This list used to be per-map, but there is really no reason for that. Instead make the flush list global for all devmaps, which simplifies __cpu_map_flush() and cpu_map_alloc(). Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-7-bjorn.topel@gmail.com --- include/linux/bpf.h | 4 ++-- kernel/bpf/cpumap.c | 36 ++++++++++++++++++------------------ net/core/filter.c | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 31191804ca09..8f3e00c84f39 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -966,7 +966,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); -void __cpu_map_flush(struct bpf_map *map); +void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1097,7 +1097,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } -static inline void __cpu_map_flush(struct bpf_map *map) +static inline void __cpu_map_flush(void) { } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f9deed659798..70f71b154fa5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -72,17 +72,18 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - struct list_head __percpu *flush_list; }; +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); + static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; - int ret, cpu; u64 cost; + int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - cmap->flush_list = alloc_percpu(struct list_head); - if (!cmap->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_percpu; + goto free_charge; return &cmap->map; -free_percpu: - free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -526,7 +517,6 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -618,7 +608,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) @@ -657,10 +647,9 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_flush(struct bpf_map *map) +void __cpu_map_flush(void) { - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -670,3 +659,14 @@ void __cpu_map_flush(struct bpf_map *map) wake_up_process(bq->obj->kthread); } } + +static int __init cpu_map_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); + return 0; +} + +subsys_initcall(cpu_map_init); diff --git a/net/core/filter.c b/net/core/filter.c index b7570cb84902..c706325b3e66 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3558,7 +3558,7 @@ void xdp_do_flush_map(void) __dev_map_flush(); break; case BPF_MAP_TYPE_CPUMAP: - __cpu_map_flush(map); + __cpu_map_flush(); break; case BPF_MAP_TYPE_XSKMAP: __xsk_map_flush(); -- cgit v1.2.3 From 332f22a60e4c3492d4953cd6f7aaa4e8bd0bba97 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:05 +0100 Subject: xdp: Remove map_to_flush and map swap detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all XDP maps that can be used with bpf_redirect_map() tracks entries to be flushed in a global fashion, there is not need to track that the map has changed and flush from xdp_do_generic_map() anymore. All entries will be flushed in xdp_do_flush_map(). This means that the map_to_flush can be removed, and the corresponding checks. Moving the flush logic to one place, xdp_do_flush_map(), give a bulking behavior and performance boost. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-8-bjorn.topel@gmail.com --- include/linux/filter.h | 1 - net/core/filter.c | 27 +++------------------------ 2 files changed, 3 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 37ac7025031d..69d6706fc889 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -592,7 +592,6 @@ struct bpf_redirect_info { u32 tgt_index; void *tgt_value; struct bpf_map *map; - struct bpf_map *map_to_flush; u32 kern_flags; }; diff --git a/net/core/filter.c b/net/core/filter.c index c706325b3e66..d9caa3e57ea1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3547,26 +3547,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = ri->map_to_flush; - - ri->map_to_flush = NULL; - if (map) { - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - __dev_map_flush(); - break; - case BPF_MAP_TYPE_CPUMAP: - __cpu_map_flush(); - break; - case BPF_MAP_TYPE_XSKMAP: - __xsk_map_flush(); - break; - default: - break; - } - } + __dev_map_flush(); + __cpu_map_flush(); + __xsk_map_flush(); } EXPORT_SYMBOL_GPL(xdp_do_flush_map); @@ -3615,14 +3598,10 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) - xdp_do_flush_map(); - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); if (unlikely(err)) goto err; - ri->map_to_flush = map; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: -- cgit v1.2.3 From 1170beaa3fa3c3381fd820e9d05b84d168fe1dab Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 19 Dec 2019 07:10:06 +0100 Subject: xdp: Simplify __bpf_tx_xdp_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The explicit error checking is not needed. Simply return the error instead. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-9-bjorn.topel@gmail.com --- net/core/filter.c | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d9caa3e57ea1..217af9974c86 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3510,35 +3510,16 @@ err: } static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, - struct xdp_buff *xdp) + struct bpf_map *map, struct xdp_buff *xdp) { - int err; - switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: { - struct bpf_dtab_netdev *dst = fwd; - - err = dev_map_enqueue(dst, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_CPUMAP: { - struct bpf_cpu_map_entry *rcpu = fwd; - - err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (unlikely(err)) - return err; - break; - } - case BPF_MAP_TYPE_XSKMAP: { - struct xdp_sock *xs = fwd; - - err = __xsk_map_redirect(xs, xdp); - return err; - } + case BPF_MAP_TYPE_DEVMAP_HASH: + return dev_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_CPUMAP: + return cpu_map_enqueue(fwd, xdp, dev_rx); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_redirect(fwd, xdp); default: break; } -- cgit v1.2.3 From 484b165306e18fd9a0d960e6ed2d6eff31665522 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:20 +0100 Subject: xsk: Eliminate the lazy update threshold The lazy update threshold was introduced to keep the producer and consumer some distance apart in the completion ring. This was important in the beginning of the development of AF_XDP as the ring format as that point in time was very sensitive to the producer and consumer being on the same cache line. This is not the case anymore as the current ring format does not degrade in any noticeable way when this happens. Moreover, this threshold makes it impossible to run the system with rings that have less than 128 entries. So let us remove this threshold and just get one entry from the ring as in all other functions. This will enable us to remove this function in a later commit. Note that xskq_produce_addr_lazy followed by xskq_produce_flush_addr_n are still not the same function as xskq_produce_addr() as it operates on another cached pointer. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-2-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index eddae4688862..a2f0ba65559f 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -11,7 +11,6 @@ #include #define RX_BATCH_SIZE 16 -#define LAZY_UPDATE_THRESHOLD 128 struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -239,7 +238,7 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) + if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; /* A, matches D */ -- cgit v1.2.3 From 11cc2d21499cabe7e7964389634ed1de3ee91d33 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:21 +0100 Subject: xsk: Simplify detection of empty and full rings In order to set the correct return flags for poll, the xsk code has to check if the Rx queue is empty and if the Tx queue is full. This code was unnecessarily large and complex as it used the functions that are used to update the local state from the global state (xskq_nb_free and xskq_nb_avail). Since we are not doing this nor updating any data dependent on this state, we can simplify the functions. Another benefit from this is that we can also simplify the xskq_nb_free and xskq_nb_avail functions in a later commit. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-3-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a2f0ba65559f..b28f9da7b317 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -362,12 +362,15 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) static inline bool xskq_full_desc(struct xsk_queue *q) { - return xskq_nb_avail(q, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; } static inline bool xskq_empty_desc(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); -- cgit v1.2.3 From d7012f05e3ca5aba92770c370895092d4882e8c2 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:22 +0100 Subject: xsk: Consolidate to one single cached producer pointer Currently, the xsk ring code has two cached producer pointers: prod_head and prod_tail. This patch consolidates these two into a single one called cached_prod to make the code simpler and easier to maintain. This will be in line with the user space part of the the code found in libbpf, that only uses a single cached pointer. The Rx path only uses the two top level functions xskq_produce_batch_desc and xskq_produce_flush_desc and they both use prod_head and never prod_tail. So just move them over to cached_prod. The Tx XDP_DRV path uses xskq_produce_addr_lazy and xskq_produce_flush_addr_n and unnecessarily operates on both prod_tail and prod_head, so move them over to just use cached_prod by skipping the intermediate step of updating prod_tail. The Tx path in XDP_SKB mode uses xskq_reserve_addr and xskq_produce_addr. They currently use both cached pointers, but we can operate on the global producer pointer in xskq_produce_addr since it has to be updated anyway, thus eliminating the use of both cached pointers. We can also remove the xskq_nb_free in xskq_produce_addr since it is already called in xskq_reserve_addr. No need to do it twice. When there is only one cached producer pointer, we can also simplify xskq_nb_free by removing one argument. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-4-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b28f9da7b317..7ad80748f209 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -35,8 +35,7 @@ struct xsk_queue { u64 size; u32 ring_mask; u32 nentries; - u32 prod_head; - u32 prod_tail; + u32 cached_prod; u32 cons_head; u32 cons_tail; struct xdp_ring *ring; @@ -94,39 +93,39 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) { - u32 entries = q->prod_tail - q->cons_tail; + u32 entries = q->cached_prod - q->cons_tail; if (entries == 0) { /* Refresh the local pointer */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; + q->cached_prod = READ_ONCE(q->ring->producer); + entries = q->cached_prod - q->cons_tail; } return (entries > dcnt) ? dcnt : entries; } -static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +static inline u32 xskq_nb_free(struct xsk_queue *q, u32 dcnt) { - u32 free_entries = q->nentries - (producer - q->cons_tail); + u32 free_entries = q->nentries - (q->cached_prod - q->cons_tail); if (free_entries >= dcnt) return free_entries; /* Refresh the local tail pointer */ q->cons_tail = READ_ONCE(q->ring->consumer); - return q->nentries - (producer - q->cons_tail); + return q->nentries - (q->cached_prod - q->cons_tail); } static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) { - u32 entries = q->prod_tail - q->cons_tail; + u32 entries = q->cached_prod - q->cons_tail; if (entries >= cnt) return true; /* Refresh the local pointer. */ - q->prod_tail = READ_ONCE(q->ring->producer); - entries = q->prod_tail - q->cons_tail; + q->cached_prod = READ_ONCE(q->ring->producer); + entries = q->cached_prod - q->cons_tail; return entries >= cnt; } @@ -220,17 +219,15 @@ static inline void xskq_discard_addr(struct xsk_queue *q) static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - if (xskq_nb_free(q, q->prod_tail, 1) == 0) - return -ENOSPC; + unsigned int idx = q->ring->producer; /* A, matches D */ - ring->desc[q->prod_tail++ & q->ring_mask] = addr; + ring->desc[idx++ & q->ring_mask] = addr; /* Order producer and data */ smp_wmb(); /* B, matches C */ - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, idx); return 0; } @@ -238,11 +235,11 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_nb_free(q, 1) == 0) return -ENOSPC; /* A, matches D */ - ring->desc[q->prod_head++ & q->ring_mask] = addr; + ring->desc[q->cached_prod++ & q->ring_mask] = addr; return 0; } @@ -252,17 +249,16 @@ static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail += nb_entries; - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, q->ring->producer + nb_entries); } static inline int xskq_reserve_addr(struct xsk_queue *q) { - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_nb_free(q, 1) == 0) return -ENOSPC; /* A, matches D */ - q->prod_head++; + q->cached_prod++; return 0; } @@ -340,11 +336,11 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx; - if (xskq_nb_free(q, q->prod_head, 1) == 0) + if (xskq_nb_free(q, 1) == 0) return -ENOSPC; /* A, matches D */ - idx = (q->prod_head++) & q->ring_mask; + idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; @@ -356,8 +352,7 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) /* Order producer and data */ smp_wmb(); /* B, matches C */ - q->prod_tail = q->prod_head; - WRITE_ONCE(q->ring->producer, q->prod_tail); + WRITE_ONCE(q->ring->producer, q->cached_prod); } static inline bool xskq_full_desc(struct xsk_queue *q) -- cgit v1.2.3 From 59e35e552529b858f35b30bc5a803ea532ca17f1 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:23 +0100 Subject: xsk: Standardize naming of producer ring access functions Adopt the naming of the producer ring access functions to have a similar naming convention as the functions in libbpf, but adapted to the kernel. You first reserve a number of entries that you later submit to the global state of the ring. This is much clearer, IMO, than the one that was in the kernel part. Once renamed, we also discover that two functions are actually the same, so remove one of them. Some of the primitive ring submission operations are also the same so break these out into __xskq_prod_submit that the upper level ring access functions can use. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-5-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 20 +++++++++--------- net/xdp/xsk_queue.h | 58 +++++++++++++++++++++++++---------------------------- 2 files changed, 37 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index e45c27f5cfca..5f2123fe630c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -167,7 +167,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) offset += metalen; addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); xdp_return_buff(xdp); @@ -180,7 +180,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); + int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); if (err) xs->rx_dropped++; @@ -216,7 +216,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { - xskq_produce_flush_desc(xs->rx); + xskq_prod_submit(xs->rx); xs->sk.sk_data_ready(&xs->sk); } @@ -247,12 +247,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) memcpy(buffer, xdp->data_meta, len + metalen); addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_produce_batch_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) goto out_drop; xskq_discard_addr(xs->umem->fq); - xskq_produce_flush_desc(xs->rx); + xskq_prod_submit(xs->rx); spin_unlock_bh(&xs->rx_lock); @@ -294,7 +294,7 @@ void __xsk_map_flush(void) void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { - xskq_produce_flush_addr_n(umem->cq, nb_entries); + xskq_prod_submit_n(umem->cq, nb_entries); } EXPORT_SYMBOL(xsk_umem_complete_tx); @@ -319,7 +319,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) if (!xskq_peek_desc(xs->tx, desc, umem)) continue; - if (xskq_produce_addr_lazy(umem->cq, desc->addr)) + if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; xskq_discard_desc(xs->tx); @@ -348,7 +348,7 @@ static void xsk_destruct_skb(struct sk_buff *skb) unsigned long flags; spin_lock_irqsave(&xs->tx_completion_lock, flags); - WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + xskq_prod_submit_addr(xs->umem->cq, addr); spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); @@ -389,7 +389,7 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { + if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { kfree_skb(skb); goto out; } @@ -470,7 +470,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, __xsk_sendmsg(sk); } - if (xs->rx && !xskq_empty_desc(xs->rx)) + if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && !xskq_full_desc(xs->tx)) mask |= EPOLLOUT | EPOLLWRNORM; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7ad80748f209..1b9a350f2e66 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -216,22 +216,17 @@ static inline void xskq_discard_addr(struct xsk_queue *q) q->cons_tail++; } -static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) +static inline int xskq_prod_reserve(struct xsk_queue *q) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - unsigned int idx = q->ring->producer; + if (xskq_nb_free(q, 1) == 0) + return -ENOSPC; /* A, matches D */ - ring->desc[idx++ & q->ring_mask] = addr; - - /* Order producer and data */ - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, idx); + q->cached_prod++; return 0; } -static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -243,23 +238,32 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) return 0; } -static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, - u32 nb_entries) +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { /* Order producer and data */ smp_wmb(); /* B, matches C */ - WRITE_ONCE(q->ring->producer, q->ring->producer + nb_entries); + WRITE_ONCE(q->ring->producer, idx); } -static inline int xskq_reserve_addr(struct xsk_queue *q) +static inline void xskq_prod_submit(struct xsk_queue *q) { - if (xskq_nb_free(q, 1) == 0) - return -ENOSPC; + __xskq_prod_submit(q, q->cached_prod); +} - /* A, matches D */ - q->cached_prod++; - return 0; +static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = q->ring->producer; + + ring->desc[idx++ & q->ring_mask] = addr; + + __xskq_prod_submit(q, idx); +} + +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) +{ + __xskq_prod_submit(q, q->ring->producer + nb_entries); } /* Rx/Tx queue */ @@ -330,11 +334,11 @@ static inline void xskq_discard_desc(struct xsk_queue *q) q->cons_tail++; } -static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u64 addr, u32 len) +static inline int xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx; + u32 idx; if (xskq_nb_free(q, 1) == 0) return -ENOSPC; @@ -347,14 +351,6 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, return 0; } -static inline void xskq_produce_flush_desc(struct xsk_queue *q) -{ - /* Order producer and data */ - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, q->cached_prod); -} - static inline bool xskq_full_desc(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -362,7 +358,7 @@ static inline bool xskq_full_desc(struct xsk_queue *q) q->nentries; } -static inline bool xskq_empty_desc(struct xsk_queue *q) +static inline bool xskq_prod_is_empty(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); -- cgit v1.2.3 From 4b638f13bab4dbfe8569c84e374e8201a427115c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:24 +0100 Subject: xsk: Eliminate the RX batch size In the xsk consumer ring code there is a variable called RX_BATCH_SIZE that dictates the minimum number of entries that we try to grab from the fill and Tx rings. In fact, the code always try to grab the maximum amount of entries from these rings. The only thing this variable does is to throw an error if there is less than 16 (as it is defined) entries on the ring. There is no reason to do this and it will just lead to weird behavior from user space's point of view. So eliminate this variable. With this change, we will be able to simplify the xskq_nb_free and xskq_nb_avail code in the next commit. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-6-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 1b9a350f2e66..6ca5fed87852 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -10,8 +10,6 @@ #include #include -#define RX_BATCH_SIZE 16 - struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; @@ -202,7 +200,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); /* Order consumer and data */ smp_rmb(); @@ -320,7 +318,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); /* Order consumer and data */ smp_rmb(); /* C, matches B */ -- cgit v1.2.3 From df0ae6f78a45e5696427779fc3379c5d75f5d4a5 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:25 +0100 Subject: xsk: Simplify xskq_nb_avail and xskq_nb_free At this point, there are no users of the functions xskq_nb_avail and xskq_nb_free that take any other number of entries argument than 1, so let us get rid of the second argument that takes the number of entries. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-7-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6ca5fed87852..8bfa2ee6864c 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -89,7 +89,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +static inline u32 xskq_nb_avail(struct xsk_queue *q) { u32 entries = q->cached_prod - q->cons_tail; @@ -99,19 +99,21 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) entries = q->cached_prod - q->cons_tail; } - return (entries > dcnt) ? dcnt : entries; + return entries; } -static inline u32 xskq_nb_free(struct xsk_queue *q, u32 dcnt) +static inline bool xskq_prod_is_full(struct xsk_queue *q) { u32 free_entries = q->nentries - (q->cached_prod - q->cons_tail); - if (free_entries >= dcnt) - return free_entries; + if (free_entries) + return false; /* Refresh the local tail pointer */ q->cons_tail = READ_ONCE(q->ring->consumer); - return q->nentries - (q->cached_prod - q->cons_tail); + free_entries = q->nentries - (q->cached_prod - q->cons_tail); + + return !free_entries; } static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) @@ -200,7 +202,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); + q->cons_head = q->cons_tail + xskq_nb_avail(q); /* Order consumer and data */ smp_rmb(); @@ -216,7 +218,7 @@ static inline void xskq_discard_addr(struct xsk_queue *q) static inline int xskq_prod_reserve(struct xsk_queue *q) { - if (xskq_nb_free(q, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ @@ -228,7 +230,7 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ @@ -318,7 +320,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q, 1); + q->cons_head = q->cons_tail + xskq_nb_avail(q); /* Order consumer and data */ smp_rmb(); /* C, matches B */ @@ -338,7 +340,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; - if (xskq_nb_free(q, 1) == 0) + if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ -- cgit v1.2.3 From c5ed924b54c892ee637d2e6889ef83341835a560 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:26 +0100 Subject: xsk: Simplify the consumer ring access functions Simplify and refactor consumer ring functions. The consumer first "peeks" to find descriptors or addresses that are available to read from the ring, then reads them and finally "releases" these descriptors once it is done. The two local variables cons_tail and cons_head are turned into one single variable called cached_cons. cached_tail referred to the cached value of the global consumer pointer and will be stored in cached_cons. For cached_head, we just use cached_prod instead as it was not used for a consumer queue before. It also better reflects what it really is now: a cached copy of the producer pointer. The names of the functions are also renamed in the same manner as the producer functions. The new functions are called xskq_cons_ followed by what it does. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-8-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 24 ++++++------- net/xdp/xsk_queue.h | 102 ++++++++++++++++++++++++---------------------------- 2 files changed, 58 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5f2123fe630c..cd84b9d0f1e1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,19 +41,19 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) { - return xskq_has_addrs(umem->fq, cnt); + return xskq_cons_has_entries(umem->fq, cnt); } EXPORT_SYMBOL(xsk_umem_has_addrs); u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { - return xskq_peek_addr(umem->fq, addr, umem); + return xskq_cons_peek_addr(umem->fq, addr, umem); } EXPORT_SYMBOL(xsk_umem_peek_addr); void xsk_umem_discard_addr(struct xdp_umem *umem) { - xskq_discard_addr(umem->fq); + xskq_cons_release(umem->fq); } EXPORT_SYMBOL(xsk_umem_discard_addr); @@ -148,7 +148,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) u32 metalen; int err; - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; @@ -169,7 +169,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) addr = xsk_umem_adjust_offset(xs->umem, addr, offset); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (!err) { - xskq_discard_addr(xs->umem->fq); + xskq_cons_release(xs->umem->fq); xdp_return_buff(xdp); return 0; } @@ -236,7 +236,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) goto out_unlock; } - if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || + if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { err = -ENOSPC; goto out_drop; @@ -251,7 +251,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) if (err) goto out_drop; - xskq_discard_addr(xs->umem->fq); + xskq_cons_release(xs->umem->fq); xskq_prod_submit(xs->rx); spin_unlock_bh(&xs->rx_lock); @@ -316,13 +316,13 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } @@ -368,7 +368,7 @@ static int xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - while (xskq_peek_desc(xs->tx, &desc, xs->umem)) { + while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) { char *buffer; u64 addr; u32 len; @@ -401,7 +401,7 @@ static int xsk_generic_xmit(struct sock *sk) skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); - xskq_discard_desc(xs->tx); + xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { /* SKB completed but not sent */ @@ -472,7 +472,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; - if (xs->tx && !xskq_full_desc(xs->tx)) + if (xs->tx && !xskq_cons_is_full(xs->tx)) mask |= EPOLLOUT | EPOLLWRNORM; return mask; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 8bfa2ee6864c..1436116767ea 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -34,8 +34,7 @@ struct xsk_queue { u32 ring_mask; u32 nentries; u32 cached_prod; - u32 cons_head; - u32 cons_tail; + u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; }; @@ -89,43 +88,48 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -static inline u32 xskq_nb_avail(struct xsk_queue *q) +static inline void __xskq_cons_release(struct xsk_queue *q) { - u32 entries = q->cached_prod - q->cons_tail; + smp_mb(); /* D, matches A */ + WRITE_ONCE(q->ring->consumer, q->cached_cons); +} - if (entries == 0) { - /* Refresh the local pointer */ - q->cached_prod = READ_ONCE(q->ring->producer); - entries = q->cached_prod - q->cons_tail; - } +static inline void __xskq_cons_peek(struct xsk_queue *q) +{ + /* Refresh the local pointer */ + q->cached_prod = READ_ONCE(q->ring->producer); + smp_rmb(); /* C, matches B */ +} - return entries; +static inline void xskq_cons_get_entries(struct xsk_queue *q) +{ + __xskq_cons_release(q); + __xskq_cons_peek(q); } static inline bool xskq_prod_is_full(struct xsk_queue *q) { - u32 free_entries = q->nentries - (q->cached_prod - q->cons_tail); + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); if (free_entries) return false; /* Refresh the local tail pointer */ - q->cons_tail = READ_ONCE(q->ring->consumer); - free_entries = q->nentries - (q->cached_prod - q->cons_tail); + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); return !free_entries; } -static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) { - u32 entries = q->cached_prod - q->cons_tail; + u32 entries = q->cached_prod - q->cached_cons; if (entries >= cnt) return true; - /* Refresh the local pointer. */ - q->cached_prod = READ_ONCE(q->ring->producer); - entries = q->cached_prod - q->cons_tail; + __xskq_cons_peek(q); + entries = q->cached_prod - q->cached_cons; return entries >= cnt; } @@ -172,9 +176,10 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, struct xdp_umem *umem) { - while (q->cons_tail != q->cons_head) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; @@ -190,30 +195,27 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, return addr; out: - q->cons_tail++; + q->cached_cons++; } return NULL; } -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline u64 *xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q); - - /* Order consumer and data */ - smp_rmb(); - } - + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); return xskq_validate_addr(q, addr, umem); } -static inline void xskq_discard_addr(struct xsk_queue *q) +static inline void xskq_cons_release(struct xsk_queue *q) { - q->cons_tail++; + /* To improve performance, only update local state here. + * Do the actual release operation when we get new entries + * from the ring in xskq_cons_get_entries() instead. + */ + q->cached_cons++; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -299,41 +301,29 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) { - while (q->cons_tail != q->cons_head) { + while (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; - unsigned int idx = q->cons_tail & q->ring_mask; + u32 idx = q->cached_cons & q->ring_mask; *desc = READ_ONCE(ring->desc[idx]); if (xskq_is_valid_desc(q, desc, umem)) return desc; - q->cons_tail++; + q->cached_cons++; } return NULL; } -static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline struct xdp_desc *xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { - if (q->cons_tail == q->cons_head) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cons_tail); - q->cons_head = q->cons_tail + xskq_nb_avail(q); - - /* Order consumer and data */ - smp_rmb(); /* C, matches B */ - } - + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); return xskq_validate_desc(q, desc, umem); } -static inline void xskq_discard_desc(struct xsk_queue *q) -{ - q->cons_tail++; -} - static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { @@ -351,7 +341,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, return 0; } -static inline bool xskq_full_desc(struct xsk_queue *q) +static inline bool xskq_cons_is_full(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == -- cgit v1.2.3 From 03896ef1f0cb23d2742ddf486c531c700a2da7d6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:27 +0100 Subject: xsk: Change names of validation functions Change the names of the validation functions to better reflect what they are doing. The uppermost ones are reading entries from the rings and only the bottom ones validate entries. So xskq_cons_read_ is a better prefix name. Also change the xskq_cons_read_ functions to return a bool as the the descriptor or address is already returned by reference in the parameters. Everyone is using the return value as a bool anyway. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-9-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 ++-- net/xdp/xsk.c | 4 ++-- net/xdp/xsk_queue.h | 59 ++++++++++++++++++++++++++------------------------ 3 files changed, 35 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 48594740d67c..63f005830866 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -118,7 +118,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); @@ -197,7 +197,7 @@ static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) return xsk_umem_has_addrs(umem, cnt - rq->length); } -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) { struct xdp_umem_fq_reuse *rq = umem->fq_reuse; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd84b9d0f1e1..4e932a930b3a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -45,7 +45,7 @@ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) } EXPORT_SYMBOL(xsk_umem_has_addrs); -u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { return xskq_cons_peek_addr(umem->fq, addr, umem); } @@ -126,7 +126,7 @@ static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, void *to_buf = xdp_umem_get_data(umem, addr); addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) { + if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; u64 page_start = addr & ~(PAGE_SIZE - 1); u64 first_len = PAGE_SIZE - (addr - page_start); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 1436116767ea..6d04a96dd092 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -136,8 +136,9 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) /* UMEM queue */ -static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, - u64 length) +static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, + u64 addr, + u64 length) { bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; bool next_pg_contig = @@ -147,7 +148,7 @@ static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, return cross_pg && !next_pg_contig; } -static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) +static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) { if (addr >= q->size) { q->invalid_descs++; @@ -157,7 +158,8 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) return true; } -static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, +static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, + u64 addr, u64 length, struct xdp_umem *umem) { @@ -165,7 +167,7 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, addr = xsk_umem_add_offset_to_addr(addr); if (base_addr >= q->size || addr >= q->size || - xskq_crosses_non_contig_pg(umem, addr, length)) { + xskq_cons_crosses_non_contig_pg(umem, addr, length)) { q->invalid_descs++; return false; } @@ -173,8 +175,8 @@ static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, return true; } -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -184,29 +186,29 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_is_valid_addr_unaligned(q, *addr, + if (xskq_cons_is_valid_unaligned(q, *addr, umem->chunk_size_nohr, umem)) - return addr; + return true; goto out; } - if (xskq_is_valid_addr(q, *addr)) - return addr; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; out: q->cached_cons++; } - return NULL; + return false; } -static inline u64 *xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, +static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, struct xdp_umem *umem) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_validate_addr(q, addr, umem); + return xskq_cons_read_addr(q, addr, umem); } static inline void xskq_cons_release(struct xsk_queue *q) @@ -270,11 +272,12 @@ static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) /* Rx/Tx queue */ -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) { if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem)) + if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) return false; if (d->len > umem->chunk_size_nohr || d->options) { @@ -285,7 +288,7 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, return true; } - if (!xskq_is_valid_addr(q, d->addr)) + if (!xskq_cons_is_valid_addr(q, d->addr)) return false; if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || @@ -297,31 +300,31 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, return true; } -static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline bool xskq_cons_read_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { while (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_desc(q, desc, umem)) - return desc; + if (xskq_cons_is_valid_desc(q, desc, umem)) + return true; q->cached_cons++; } - return NULL; + return false; } -static inline struct xdp_desc *xskq_cons_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc, - struct xdp_umem *umem) +static inline bool xskq_cons_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc, + struct xdp_umem *umem) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_validate_desc(q, desc, umem); + return xskq_cons_read_desc(q, desc, umem); } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, -- cgit v1.2.3 From f8509aa078de0842ec1817e8026e58620cd05d3b Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:28 +0100 Subject: xsk: ixgbe: i40e: ice: mlx5: Xsk_umem_discard_addr to xsk_umem_release_addr Change the name of xsk_umem_discard_addr to xsk_umem_release_addr to better reflect the new naming of the AF_XDP queue manipulation functions. As this functions is used by drivers implementing support for AF_XDP zero-copy, it requires a name change to these drivers. The function xsk_umem_release_addr_rq has also changed name in the same fashion. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-10-git-send-email-magnus.karlsson@intel.com --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_xsk.c | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c | 2 +- include/net/xdp_sock.h | 10 +++++----- net/xdp/xsk.c | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index d07e1a890428..20d6f11a3468 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -269,7 +269,7 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); return true; } @@ -306,7 +306,7 @@ static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); return true; } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index cf9b8b22d24f..0af1f0b951c0 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -555,7 +555,7 @@ ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) rx_buf->handle = handle + umem->headroom; - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); return true; } @@ -591,7 +591,7 @@ ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) rx_buf->handle = handle + umem->headroom; - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); return true; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index d6feaacfbf89..1501d0a22078 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -277,7 +277,7 @@ static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); return true; } @@ -304,7 +304,7 @@ static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring, bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); return true; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index 475b6bd5d29b..62fc8a128a8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -35,7 +35,7 @@ int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, */ dma_info->addr = xdp_umem_get_dma(umem, handle); - xsk_umem_discard_addr_rq(umem); + xsk_umem_release_addr_rq(umem); dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE, DMA_BIDIRECTIONAL); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 63f005830866..e86ec48ef627 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -119,7 +119,7 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_release_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); @@ -208,12 +208,12 @@ static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) return addr; } -static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) { struct xdp_umem_fq_reuse *rq = umem->fq_reuse; if (!rq->length) - xsk_umem_discard_addr(umem); + xsk_umem_release_addr(umem); else rq->length--; } @@ -258,7 +258,7 @@ static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) return NULL; } -static inline void xsk_umem_discard_addr(struct xdp_umem *umem) +static inline void xsk_umem_release_addr(struct xdp_umem *umem) { } @@ -332,7 +332,7 @@ static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) return NULL; } -static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) { } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e932a930b3a..55092feeaf36 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -51,11 +51,11 @@ bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) } EXPORT_SYMBOL(xsk_umem_peek_addr); -void xsk_umem_discard_addr(struct xdp_umem *umem) +void xsk_umem_release_addr(struct xdp_umem *umem) { xskq_cons_release(umem->fq); } -EXPORT_SYMBOL(xsk_umem_discard_addr); +EXPORT_SYMBOL(xsk_umem_release_addr); void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { -- cgit v1.2.3 From c34787fcc90f0732ff00754057f11780007002e4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:29 +0100 Subject: xsk: Remove unnecessary READ_ONCE of data There are two unnecessary READ_ONCE of descriptor data. These are not needed since the data is written by the producer before it signals that the data is available by incrementing the producer pointer. As the access to this producer pointer is serialized and the consumer always reads the descriptor after it has read and synchronized with the producer counter, the write of the descriptor will have fully completed and it does not matter if the consumer has any read tearing. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-11-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6d04a96dd092..4c049cacc693 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -183,7 +183,7 @@ static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, while (q->cached_cons != q->cached_prod) { u32 idx = q->cached_cons & q->ring_mask; - *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; + *addr = ring->desc[idx] & q->chunk_mask; if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { if (xskq_cons_is_valid_unaligned(q, *addr, @@ -308,7 +308,7 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; - *desc = READ_ONCE(ring->desc[idx]); + *desc = ring->desc[idx]; if (xskq_cons_is_valid_desc(q, desc, umem)) return true; -- cgit v1.2.3 From 15d8c9162ced1789d25261859a7b37db8426e409 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:30 +0100 Subject: xsk: Add function naming comments and reorder functions Add comments on how the ring access functions are named and how they are supposed to be used for producers and consumers. The functions are also reordered so that the consumer functions are in the beginning and the producer functions in the end, for easier reference. Put this in a separate patch as the diff might look a little odd, but no functionality has changed in this patch. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-12-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 10 ++ net/xdp/xsk_queue.h | 293 ++++++++++++++++++++++++++++------------------------ 2 files changed, 167 insertions(+), 136 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 55092feeaf36..5560d60c1ff5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -319,6 +319,11 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ if (xskq_prod_reserve_addr(umem->cq, desc->addr)) goto out; @@ -389,6 +394,11 @@ static int xsk_generic_xmit(struct sock *sk) addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); + /* This is the backpreassure mechanism for the Tx path. + * Reserve space in the completion queue and only proceed + * if there is space in it. This avoids having to implement + * any buffering in the Tx path. + */ if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) { kfree_skb(skb); goto out; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 4c049cacc693..bec2af11853a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -81,60 +81,27 @@ struct xsk_queue { * now and again after circling through the ring. */ -/* Common functions operating for both RXTX and umem queues */ - -static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) -{ - return q ? q->invalid_descs : 0; -} - -static inline void __xskq_cons_release(struct xsk_queue *q) -{ - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cached_cons); -} - -static inline void __xskq_cons_peek(struct xsk_queue *q) -{ - /* Refresh the local pointer */ - q->cached_prod = READ_ONCE(q->ring->producer); - smp_rmb(); /* C, matches B */ -} - -static inline void xskq_cons_get_entries(struct xsk_queue *q) -{ - __xskq_cons_release(q); - __xskq_cons_peek(q); -} - -static inline bool xskq_prod_is_full(struct xsk_queue *q) -{ - u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - - if (free_entries) - return false; - - /* Refresh the local tail pointer */ - q->cached_cons = READ_ONCE(q->ring->consumer); - free_entries = q->nentries - (q->cached_prod - q->cached_cons); - - return !free_entries; -} - -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) -{ - u32 entries = q->cached_prod - q->cached_cons; - - if (entries >= cnt) - return true; - - __xskq_cons_peek(q); - entries = q->cached_prod - q->cached_cons; - - return entries >= cnt; -} +/* The operations on the rings are the following: + * + * producer consumer + * + * RESERVE entries PEEK in the ring for entries + * WRITE data into the ring READ data from the ring + * SUBMIT entries RELEASE entries + * + * The producer reserves one or more entries in the ring. It can then + * fill in these entries and finally submit them so that they can be + * seen and read by the consumer. + * + * The consumer peeks into the ring to see if the producer has written + * any new entries. If so, the producer can then read these entries + * and when it is done reading them release them back to the producer + * so that the producer can use these slots to fill in new entries. + * + * The function names below reflect these operations. + */ -/* UMEM queue */ +/* Functions that read and validate content from consumer rings. */ static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, @@ -148,16 +115,6 @@ static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, return cross_pg && !next_pg_contig; } -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) -{ - if (addr >= q->size) { - q->invalid_descs++; - return false; - } - - return true; -} - static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, u64 addr, u64 length, @@ -175,6 +132,16 @@ static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, return true; } +static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) +{ + if (addr >= q->size) { + q->invalid_descs++; + return false; + } + + return true; +} + static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, struct xdp_umem *umem) { @@ -203,75 +170,6 @@ out: return false; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); -} - -static inline void xskq_cons_release(struct xsk_queue *q) -{ - /* To improve performance, only update local state here. - * Do the actual release operation when we get new entries - * from the ring in xskq_cons_get_entries() instead. - */ - q->cached_cons++; -} - -static inline int xskq_prod_reserve(struct xsk_queue *q) -{ - if (xskq_prod_is_full(q)) - return -ENOSPC; - - /* A, matches D */ - q->cached_prod++; - return 0; -} - -static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - if (xskq_prod_is_full(q)) - return -ENOSPC; - - /* A, matches D */ - ring->desc[q->cached_prod++ & q->ring_mask] = addr; - return 0; -} - -static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) -{ - /* Order producer and data */ - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, idx); -} - -static inline void xskq_prod_submit(struct xsk_queue *q) -{ - __xskq_prod_submit(q, q->cached_prod); -} - -static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 idx = q->ring->producer; - - ring->desc[idx++ & q->ring_mask] = addr; - - __xskq_prod_submit(q, idx); -} - -static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) -{ - __xskq_prod_submit(q, q->ring->producer + nb_entries); -} - -/* Rx/Tx queue */ - static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) @@ -318,6 +216,48 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +/* Functions for consumers */ + +static inline void __xskq_cons_release(struct xsk_queue *q) +{ + smp_mb(); /* D, matches A */ + WRITE_ONCE(q->ring->consumer, q->cached_cons); +} + +static inline void __xskq_cons_peek(struct xsk_queue *q) +{ + /* Refresh the local pointer */ + q->cached_prod = READ_ONCE(q->ring->producer); + smp_rmb(); /* C, matches B */ +} + +static inline void xskq_cons_get_entries(struct xsk_queue *q) +{ + __xskq_cons_release(q); + __xskq_cons_peek(q); +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries >= cnt) + return true; + + __xskq_cons_peek(q); + entries = q->cached_prod - q->cached_cons; + + return entries >= cnt; +} + +static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr(q, addr, umem); +} + static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) @@ -327,6 +267,60 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, umem); } +static inline void xskq_cons_release(struct xsk_queue *q) +{ + /* To improve performance, only update local state here. + * Reflect this to global state when we get new entries + * from the ring in xskq_cons_get_entries(). + */ + q->cached_cons++; +} + +static inline bool xskq_cons_is_full(struct xsk_queue *q) +{ + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; +} + +/* Functions for producers */ + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + if (free_entries) + return false; + + /* Refresh the local tail pointer */ + q->cached_cons = READ_ONCE(q->ring->consumer); + free_entries = q->nentries - (q->cached_prod - q->cached_cons); + + return !free_entries; +} + +static inline int xskq_prod_reserve(struct xsk_queue *q) +{ + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + q->cached_prod++; + return 0; +} + +static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (xskq_prod_is_full(q)) + return -ENOSPC; + + /* A, matches D */ + ring->desc[q->cached_prod++ & q->ring_mask] = addr; + return 0; +} + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { @@ -344,11 +338,31 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, return 0; } -static inline bool xskq_cons_is_full(struct xsk_queue *q) +static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - /* No barriers needed since data is not accessed */ - return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == - q->nentries; + smp_wmb(); /* B, matches C */ + + WRITE_ONCE(q->ring->producer, idx); +} + +static inline void xskq_prod_submit(struct xsk_queue *q) +{ + __xskq_prod_submit(q, q->cached_prod); +} + +static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = q->ring->producer; + + ring->desc[idx++ & q->ring_mask] = addr; + + __xskq_prod_submit(q, idx); +} + +static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) +{ + __xskq_prod_submit(q, q->ring->producer + nb_entries); } static inline bool xskq_prod_is_empty(struct xsk_queue *q) @@ -357,6 +371,13 @@ static inline bool xskq_prod_is_empty(struct xsk_queue *q) return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } +/* For both producers and consumers */ + +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? q->invalid_descs : 0; +} + void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -- cgit v1.2.3 From 1d9cb1f381860b529edec57cf7a08133f40366eb Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:31 +0100 Subject: xsk: Use struct_size() helper Improve readability and maintainability by using the struct_size() helper when allocating the AF_XDP rings. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-13-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk_queue.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index b66504592d9b..c90e9c1e3c63 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -18,14 +18,14 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) q->chunk_mask = chunk_mask; } -static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); -} + struct xdp_umem_ring *umem_ring; + struct xdp_rxtx_ring *rxtx_ring; -static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) -{ - return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); + if (umem_queue) + return struct_size(umem_ring, desc, q->nentries); + return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) @@ -43,8 +43,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = umem_queue ? xskq_umem_get_ring_size(q) : - xskq_rxtx_get_ring_size(q); + size = xskq_get_ring_size(q, umem_queue); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); -- cgit v1.2.3 From 48fda74f0a9377ce2145ac5f06b6db0274880826 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 18 Dec 2019 09:02:14 +0100 Subject: net: dsa: add support for Atheros AR9331 TAG format Add support for tag format used in Atheros AR9331 built-in switch. Reviewed-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ net/dsa/Kconfig | 6 ++++ net/dsa/Makefile | 1 + net/dsa/tag_ar9331.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 105 insertions(+) create mode 100644 net/dsa/tag_ar9331.c (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6767dc3f66c0..da5578db228e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -43,6 +43,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 #define DSA_TAG_PROTO_OCELOT_VALUE 15 +#define DSA_TAG_PROTO_AR9331_VALUE 16 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -61,6 +62,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, + DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE, }; struct packet_type; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 1e6c3cac11e6..92663dcb3aa2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -29,6 +29,12 @@ config NET_DSA_TAG_8021Q Drivers which use these helpers should select this as dependency. +config NET_DSA_TAG_AR9331 + tristate "Tag driver for Atheros AR9331 SoC with built-in switch" + help + Say Y or M if you want to enable support for tagging frames for + the Atheros AR9331 SoC with built-in switch. + config NET_DSA_TAG_BRCM_COMMON tristate default n diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 9a482c38bdb1..108486cfdeef 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -5,6 +5,7 @@ dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o # tagging formats obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o +obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c new file mode 100644 index 000000000000..466ffa92a474 --- /dev/null +++ b/net/dsa/tag_ar9331.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Pengutronix, Oleksij Rempel + */ + + +#include +#include + +#include "dsa_priv.h" + +#define AR9331_HDR_LEN 2 +#define AR9331_HDR_VERSION 1 + +#define AR9331_HDR_VERSION_MASK GENMASK(15, 14) +#define AR9331_HDR_PRIORITY_MASK GENMASK(13, 12) +#define AR9331_HDR_TYPE_MASK GENMASK(10, 8) +#define AR9331_HDR_BROADCAST BIT(7) +#define AR9331_HDR_FROM_CPU BIT(6) +/* AR9331_HDR_RESERVED - not used or may be version field. + * According to the AR8216 doc it should 0b10. On AR9331 it is 0b11 on RX path + * and should be set to 0b11 to make it work. + */ +#define AR9331_HDR_RESERVED_MASK GENMASK(5, 4) +#define AR9331_HDR_PORT_NUM_MASK GENMASK(3, 0) + +static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + __le16 *phdr; + u16 hdr; + + if (skb_cow_head(skb, 0) < 0) + return NULL; + + phdr = skb_push(skb, AR9331_HDR_LEN); + + hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION); + hdr |= AR9331_HDR_FROM_CPU | dp->index; + /* 0b10 for AR8216 and 0b11 for AR9331 */ + hdr |= AR9331_HDR_RESERVED_MASK; + + phdr[0] = cpu_to_le16(hdr); + + return skb; +} + +static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt) +{ + u8 ver, port; + u16 hdr; + + if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN))) + return NULL; + + hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb)); + + ver = FIELD_GET(AR9331_HDR_VERSION_MASK, hdr); + if (unlikely(ver != AR9331_HDR_VERSION)) { + netdev_warn_once(ndev, "%s:%i wrong header version 0x%2x\n", + __func__, __LINE__, hdr); + return NULL; + } + + if (unlikely(hdr & AR9331_HDR_FROM_CPU)) { + netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%2x\n", + __func__, __LINE__, hdr); + return NULL; + } + + skb_pull_rcsum(skb, AR9331_HDR_LEN); + + /* Get source port information */ + port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr); + + skb->dev = dsa_master_find_slave(ndev, 0, port); + if (!skb->dev) + return NULL; + + return skb; +} + +static const struct dsa_device_ops ar9331_netdev_ops = { + .name = "ar9331", + .proto = DSA_TAG_PROTO_AR9331, + .xmit = ar9331_tag_xmit, + .rcv = ar9331_tag_rcv, + .overhead = AR9331_HDR_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331); +module_dsa_tag_driver(ar9331_netdev_ops); -- cgit v1.2.3 From e1b5e598e5a51b453328879682b178b4acc15105 Mon Sep 17 00:00:00 2001 From: John Rutherford Date: Thu, 19 Dec 2019 16:03:57 +1100 Subject: tipc: make legacy address flag readable over netlink To enable iproute2/tipc to generate backwards compatible printouts and validate command parameters for nodes using a node address, it needs to be able to read the legacy address flag from the kernel. The legacy address flag records the way in which the node identity was originally specified. The legacy address flag is requested by the netlink message TIPC_NL_ADDR_LEGACY_GET. If the flag is set the attribute TIPC_NLA_NET_ADDR_LEGACY is set in the return message. Signed-off-by: John Rutherford Acked-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 2 ++ net/tipc/net.c | 56 +++++++++++++++++++++++++++++++++++++++ net/tipc/net.h | 1 + net/tipc/netlink.c | 6 +++++ 4 files changed, 65 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 6c2194ab745b..dc0d23a50e69 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -65,6 +65,7 @@ enum { TIPC_NL_UDP_GET_REMOTEIP, TIPC_NL_KEY_SET, TIPC_NL_KEY_FLUSH, + TIPC_NL_ADDR_LEGACY_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -176,6 +177,7 @@ enum { TIPC_NLA_NET_ADDR, /* u32 */ TIPC_NLA_NET_NODEID, /* u64 */ TIPC_NLA_NET_NODEID_W1, /* u64 */ + TIPC_NLA_NET_ADDR_LEGACY, /* flag */ __TIPC_NLA_NET_MAX, TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 diff --git a/net/tipc/net.c b/net/tipc/net.c index 2de3cec9929d..85400e4242de 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -302,3 +302,59 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) return err; } + +static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) +{ + struct tipc_net *tn = tipc_net(net); + struct nlattr *attrs; + void *hdr; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + 0, TIPC_NL_ADDR_LEGACY_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); + if (!attrs) + goto msg_full; + + if (tn->legacy_addr_format) + if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_nl_msg msg; + struct sk_buff *rep; + int err; + + rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rep) + return -ENOMEM; + + msg.skb = rep; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + err = __tipc_nl_addr_legacy_get(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + + return genlmsg_reply(msg.skb, info); +} diff --git a/net/tipc/net.h b/net/tipc/net.h index b7f2e364eb99..6740d97c706e 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -47,5 +47,6 @@ void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index e53231bd23b4..7c35094c20b8 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -83,6 +83,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_ADDR] = { .type = NLA_U32 }, [TIPC_NLA_NET_NODEID] = { .type = NLA_U64 }, [TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 }, + [TIPC_NLA_NET_ADDR_LEGACY] = { .type = NLA_FLAG } }; const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { @@ -273,6 +274,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_node_flush_key, }, #endif + { + .cmd = TIPC_NL_ADDR_LEGACY_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_net_addr_legacy_get, + }, }; struct genl_family tipc_genl_family __ro_after_init = { -- cgit v1.2.3 From e7dbfed1adb0e9123f582293e79760dc52b00d74 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Sat, 21 Dec 2019 08:50:01 +0530 Subject: net: skb_mpls_push() modified to allow MPLS header push at start of packet. The existing skb_mpls_push() implementation always inserts mpls header after the mac header. L2 VPN use cases requires MPLS header to be inserted before the ethernet header as the ethernet packet gets tunnelled inside MPLS header in those cases. Signed-off-by: Martin Varghese Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 973a71f4bc89..d90c8276a9ee 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5472,12 +5472,15 @@ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, } /** - * skb_mpls_push() - push a new MPLS header after the mac header + * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of + * the packet * * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is + * ethernet * * Expects skb->data at mac header. * @@ -5501,7 +5504,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, return err; if (!skb->inner_protocol) { - skb_set_inner_network_header(skb, mac_len); + skb_set_inner_network_header(skb, skb_network_offset(skb)); skb_set_inner_protocol(skb, skb->protocol); } @@ -5510,6 +5513,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, mac_len); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); + skb_reset_mac_len(skb); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; -- cgit v1.2.3 From 76f99f987f2b7f95d43857b6e0362bd8dec9890c Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Sat, 21 Dec 2019 08:50:23 +0530 Subject: net: Rephrased comments section of skb_mpls_pop() Rephrased comments section of skb_mpls_pop() to align it with comments section of skb_mpls_push(). Signed-off-by: Martin Varghese Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d90c8276a9ee..44b0894d8ae1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5533,7 +5533,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_push); * @skb: buffer * @next_proto: ethertype of header after popped MPLS header * @mac_len: length of the MAC header - * @ethernet: flag to indicate if ethernet header is present in packet + * @ethernet: flag to indicate if the packet is ethernet * * Expects skb->data at mac header. * -- cgit v1.2.3 From f66b53fdbb22ced1a323b22b9de84a61aacd8d18 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Sat, 21 Dec 2019 08:50:46 +0530 Subject: openvswitch: New MPLS actions for layer 2 tunnelling The existing PUSH MPLS action inserts MPLS header between ethernet header and the IP header. Though this behaviour is fine for L3 VPN where an IP packet is encapsulated inside a MPLS tunnel, it does not suffice the L2 VPN (l2 tunnelling) requirements. In L2 VPN the MPLS header should encapsulate the ethernet packet. The new mpls action ADD_MPLS inserts MPLS header at the start of the packet or at the start of the l3 header depending on the value of l3 tunnel flag in the ADD_MPLS arguments. POP_MPLS action is extended to support ethertype 0x6558. Signed-off-by: Martin Varghese Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 31 +++++++++++++++++++++++++++++++ net/openvswitch/actions.c | 30 ++++++++++++++++++++++++------ net/openvswitch/flow_netlink.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a87b44cd5590..ae2bff14e7e1 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -672,6 +672,32 @@ struct ovs_action_push_mpls { __be16 mpls_ethertype; /* Either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC */ }; +/** + * struct ovs_action_add_mpls - %OVS_ACTION_ATTR_ADD_MPLS action + * argument. + * @mpls_lse: MPLS label stack entry to push. + * @mpls_ethertype: Ethertype to set in the encapsulating ethernet frame. + * @tun_flags: MPLS tunnel attributes. + * + * The only values @mpls_ethertype should ever be given are %ETH_P_MPLS_UC and + * %ETH_P_MPLS_MC, indicating MPLS unicast or multicast. Other are rejected. + */ +struct ovs_action_add_mpls { + __be32 mpls_lse; + __be16 mpls_ethertype; /* Either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC */ + __u16 tun_flags; +}; + +#define OVS_MPLS_L3_TUNNEL_FLAG_MASK (1 << 0) /* Flag to specify the place of + * insertion of MPLS header. + * When false, the MPLS header + * will be inserted at the start + * of the packet. + * When true, the MPLS header + * will be inserted at the start + * of the l3 header. + */ + /** * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. * @vlan_tpid: Tag protocol identifier (TPID) to push. @@ -892,6 +918,10 @@ struct check_pkt_len_arg { * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set * of actions if greater than the specified packet length, else execute * another set of actions. + * @OVS_ACTION_ATTR_ADD_MPLS: Push a new MPLS label stack entry at the + * start of the packet or at the start of the l3 header depending on the value + * of l3 tunnel flag in the tun_flags field of OVS_ACTION_ATTR_ADD_MPLS + * argument. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -927,6 +957,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_METER, /* u32 meter ID. */ OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ + OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 4c8395462303..7fbfe2adfffa 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -161,16 +161,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct nlattr *attr, int len); static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_action_push_mpls *mpls) + __be32 mpls_lse, __be16 mpls_ethertype, __u16 mac_len) { int err; - err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype, - skb->mac_len, - ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); + err = skb_mpls_push(skb, mpls_lse, mpls_ethertype, mac_len, !!mac_len); if (err) return err; + if (!mac_len) + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); return 0; } @@ -185,6 +186,9 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (err) return err; + if (ethertype == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + invalidate_flow_key(key); return 0; } @@ -1229,10 +1233,24 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; - case OVS_ACTION_ATTR_PUSH_MPLS: - err = push_mpls(skb, key, nla_data(a)); + case OVS_ACTION_ATTR_PUSH_MPLS: { + struct ovs_action_push_mpls *mpls = nla_data(a); + + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, skb->mac_len); break; + } + case OVS_ACTION_ATTR_ADD_MPLS: { + struct ovs_action_add_mpls *mpls = nla_data(a); + __u16 mac_len = 0; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) + mac_len = skb->mac_len; + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, mac_len); + break; + } case OVS_ACTION_ATTR_POP_MPLS: err = pop_mpls(skb, key, nla_get_be16(a)); break; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 65c2e3458ff5..7da4230627f5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -79,6 +79,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_SET_MASKED: case OVS_ACTION_ATTR_METER: case OVS_ACTION_ATTR_CHECK_PKT_LEN: + case OVS_ACTION_ATTR_ADD_MPLS: default: return true; } @@ -3005,6 +3006,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_METER] = sizeof(u32), [OVS_ACTION_ATTR_CLONE] = (u32)-1, [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, + [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3072,6 +3074,33 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_ADD_MPLS: { + const struct ovs_action_add_mpls *mpls = nla_data(a); + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) { + if (vlan_tci & htons(VLAN_CFI_MASK) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + mpls_label_count++; + } else { + if (mac_proto == MAC_PROTO_ETHERNET) { + mpls_label_count = 1; + mac_proto = MAC_PROTO_NONE; + } else { + mpls_label_count++; + } + } + eth_type = mpls->mpls_ethertype; + break; + } + case OVS_ACTION_ATTR_PUSH_MPLS: { const struct ovs_action_push_mpls *mpls = nla_data(a); @@ -3109,6 +3138,11 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, * recirculation. */ proto = nla_get_be16(a); + + if (proto == htons(ETH_P_TEB) && + mac_proto != MAC_PROTO_NONE) + return -EINVAL; + mpls_label_count--; if (!eth_p_mpls(proto) || !mpls_label_count) -- cgit v1.2.3 From c10c4279c778df03f404a4d6906d7d4b840eb95f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:13 +0200 Subject: ipv6: Notify newly added route if should be offloaded fib6_add_rt2node() takes care of adding a single route ('struct fib6_info') to a FIB node. The route in question should only be notified in case it is added as the first route in the node (lowest metric) or if it is added as a sibling route to the first route in the node. The first criterion can be tested by checking if the route is pointed to by 'fn->leaf'. The second criterion can be tested by checking the new 'notify_sibling_rt' variable that is set when the route is added as a sibling to the first route in the node. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7bae6a91b487..045bcaf5e770 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1039,6 +1039,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; @@ -1130,6 +1131,7 @@ next_iter: /* Find the first route that have the same metric */ sibling = leaf; + notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { @@ -1139,6 +1141,7 @@ next_iter: } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); + notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings @@ -1166,6 +1169,21 @@ add: nlflags |= NLM_F_CREATE; if (!info->skip_notify_kernel) { + enum fib_event_type fib_event; + + if (notify_sibling_rt) + fib_event = FIB_EVENT_ENTRY_APPEND; + else + fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + /* The route should only be notified if it is the first + * route in the node or if it is added as a sibling + * route to the first route in the node. + */ + if (notify_sibling_rt || ins == &fn->leaf) + err = call_fib6_entry_notifiers(info->nl_net, + fib_event, rt, + extack); + err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt, extack); -- cgit v1.2.3 From 51bf7f387fdfe5ec8c33734b3124ccec83c8d0c3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:14 +0200 Subject: ipv6: Notify route if replacing currently offloaded one Similar to the corresponding IPv4 patch, only notify the new route if it is replacing the currently offloaded one. Meaning, the one pointed to by 'fn->leaf'. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 045bcaf5e770..7cf9554888b0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1231,6 +1231,13 @@ add: } if (!info->skip_notify_kernel) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + if (ins == &fn->leaf) + err = call_fib6_entry_notifiers(info->nl_net, + fib_event, rt, + extack); err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); -- cgit v1.2.3 From 0ee0f47c26b2f909ec95d7d8fed8119e288c4dd3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:15 +0200 Subject: ipv6: Notify multipath route if should be offloaded In a similar fashion to previous patches, only notify the new multipath route if it is the first route in the node or if it was appended to such route. The type of the notification (replace vs. append) is determined based on the number of routes added ('nhn') and the number of sibling routes. If the two do not match, then an append notification should be sent. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b59940416cb5..c0809f52f9ef 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5017,6 +5017,32 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); } +static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) +{ + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + bool should_notify = false; + struct fib6_info *leaf; + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + + leaf = rcu_dereference(fn->leaf); + if (!leaf) + goto out; + + if (rt == leaf || + (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && + rt6_qualify_for_ecmp(leaf))) + should_notify = true; +out: + rcu_read_unlock(); + + return should_notify; +} + static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -5147,6 +5173,28 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } + /* An in-kernel notification should only be sent in case the new + * multipath route is added as the first route in the node, or if + * it was appended to it. We pass 'rt_notif' since it is the first + * sibling and might allow us to skip some checks in the replace case. + */ + if (ip6_route_mpath_should_notify(rt_notif)) { + enum fib_event_type fib_event; + + if (rt_notif->fib6_nsiblings != nhn - 1) + fib_event = FIB_EVENT_ENTRY_APPEND; + else + fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + + err = call_fib6_multipath_entry_notifiers(info->nl_net, + fib_event, rt_notif, + nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } + } event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, rt_notif, nhn - 1, extack); -- cgit v1.2.3 From 9c6ecd3cf62d0eb57539f966d7ad617ad3a59f4b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:16 +0200 Subject: ipv6: Only Replay routes of interest to new listeners When a new listener is registered to the FIB notification chain it receives a dump of all the available routes in the system. Instead, make sure to only replay the IPv6 routes that are actually used in the data path and are of any interest to the new listener. This is done by iterating over all the routing tables in the given namespace, but from each traversed node only the first route ('leaf') is notified. Multipath routes are notified in a single notification instead of one for each nexthop. Add fib6_rt_dump_tmp() to do that. Later on in the patch set it will be renamed to fib6_rt_dump() instead of the existing one. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7cf9554888b0..51cf848e38f0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -370,6 +370,21 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, return call_fib6_notifier(nb, event_type, &info.info); } +static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + .nsiblings = nsiblings, + }; + + return call_fib6_notifier(nb, event_type, &info.info); +} + int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, @@ -414,16 +429,41 @@ static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) rt, arg->extack); } +static int fib6_rt_dump_tmp(struct fib6_info *rt, struct fib6_dump_arg *arg) +{ + enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + int err; + + if (!rt || rt == arg->net->ipv6.fib6_null_entry) + return 0; + + if (rt->fib6_nsiblings) + err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, + rt, + rt->fib6_nsiblings, + arg->extack); + else + err = call_fib6_entry_notifier(arg->nb, fib_event, rt, + arg->extack); + + return err; +} + static int fib6_node_dump(struct fib6_walker *w) { struct fib6_info *rt; int err = 0; + err = fib6_rt_dump_tmp(w->leaf, w->args); + if (err) + goto out; + for_each_fib6_walker_rt(w) { err = fib6_rt_dump(rt, w->args); if (err) break; } +out: w->leaf = NULL; return err; } -- cgit v1.2.3 From d2f0c9b11410f9c6a07c126f8a215b0b81cdcf6c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:17 +0200 Subject: ipv6: Handle route deletion notification For the purpose of route offload, when a single route is deleted, it is only of interest if it is the first route in the node or if it is sibling to such a route. In the first case, distinguish between several possibilities: 1. Route is the last route in the node. Emit a delete notification 2. Route is followed by a non-multipath route. Emit a replace notification for the non-multipath route. 3. Route is followed by a multipath route. Emit a replace notification for the multipath route. In the second case, only emit a delete notification to ensure the route is no longer used as a valid nexthop. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + net/ipv6/ip6_fib.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f1535f172935..b579faea41e9 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -487,6 +487,7 @@ int call_fib6_multipath_entry_notifiers(struct net *net, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack); +int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 51cf848e38f0..67ddee539f77 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -415,6 +415,18 @@ int call_fib6_multipath_entry_notifiers(struct net *net, return call_fib6_notifiers(net, event_type, &info.info); } +int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + .nsiblings = rt->fib6_nsiblings, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE_TMP, + &info.info); +} + struct fib6_dump_arg { struct net *net; struct notifier_block *nb; @@ -1910,13 +1922,29 @@ static struct fib6_node *fib6_repair_tree(struct net *net, static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { + struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; + bool notify_del = false; RT6_TRACE("fib6_del_route\n"); + /* If the deleted route is the first in the node and it is not part of + * a multipath route, then we need to replace it with the next route + * in the node, if exists. + */ + leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (leaf == rt && !rt->fib6_nsiblings) { + if (rcu_access_pointer(rt->fib6_next)) + replace_rt = rcu_dereference_protected(rt->fib6_next, + lockdep_is_held(&table->tb6_lock)); + else + notify_del = true; + } + /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; @@ -1934,6 +1962,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; + /* The route is deleted from a multipath route. If this + * multipath route is the first route in the node, then we need + * to emit a delete notification. Otherwise, we need to skip + * the notification. + */ + if (rt->fib6_metric == leaf->fib6_metric && + rt6_qualify_for_ecmp(leaf)) + notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; @@ -1969,8 +2005,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - if (!info->skip_notify_kernel) + if (!info->skip_notify_kernel) { + if (notify_del) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL_TMP, + rt, NULL); + else if (replace_rt) + call_fib6_entry_notifiers_replace(net, replace_rt); call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); -- cgit v1.2.3 From 0284696b97b2fad1b220871559dff410cc3187e0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:18 +0200 Subject: ipv6: Handle multipath route deletion notification When an entire multipath route is deleted, only emit a notification if it is the first route in the node. Emit a replace notification in case the last sibling is followed by another route. Otherwise, emit a delete notification. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c0809f52f9ef..646716a47cc9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3749,6 +3749,7 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; + struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); @@ -3764,7 +3765,32 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + /* 'rt' points to the first sibling route. If it is not the + * leaf, then we do not need to send a notification. Otherwise, + * we need to check if the last sibling has a next route or not + * and emit a replace or delete notification, respectively. + */ info->skip_notify_kernel = 1; + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&table->tb6_lock)); + if (rcu_access_pointer(fn->leaf) == rt) { + struct fib6_info *last_sibling, *replace_rt; + + last_sibling = list_last_entry(&rt->fib6_siblings, + struct fib6_info, + fib6_siblings); + replace_rt = rcu_dereference_protected( + last_sibling->fib6_next, + lockdep_is_held(&table->tb6_lock)); + if (replace_rt) + call_fib6_entry_notifiers_replace(net, + replace_rt); + else + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL_TMP, + rt, rt->fib6_nsiblings, + NULL); + } call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, -- cgit v1.2.3 From caafb2509fac1432849650826953dd88b7cbe374 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 23 Dec 2019 15:28:20 +0200 Subject: ipv6: Remove old route notifications and convert listeners Now that mlxsw is converted to use the new FIB notifications it is possible to delete the old ones and use the new replace / append / delete notifications. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 9 ++-- drivers/net/netdevsim/fib.c | 1 - include/net/fib_notifier.h | 2 - net/ipv6/ip6_fib.c | 61 ++++++---------------- net/ipv6/route.c | 18 +------ 5 files changed, 21 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 295cdcb1c4c0..f62e8d67348c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5966,7 +5966,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) mlxsw_sp_span_respin(mlxsw_sp); switch (fib_work->event) { - case FIB_EVENT_ENTRY_REPLACE_TMP: + case FIB_EVENT_ENTRY_REPLACE: err = mlxsw_sp_router_fib6_replace(mlxsw_sp, fib_work->fib6_work.rt_arr, fib_work->fib6_work.nrt6); @@ -5982,7 +5982,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) mlxsw_sp_router_fib_abort(mlxsw_sp); mlxsw_sp_router_fib6_work_fini(&fib_work->fib6_work); break; - case FIB_EVENT_ENTRY_DEL_TMP: + case FIB_EVENT_ENTRY_DEL: mlxsw_sp_router_fib6_del(mlxsw_sp, fib_work->fib6_work.rt_arr, fib_work->fib6_work.nrt6); @@ -6068,9 +6068,9 @@ static int mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work, int err; switch (fib_work->event) { - case FIB_EVENT_ENTRY_REPLACE_TMP: /* fall through */ + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ case FIB_EVENT_ENTRY_APPEND: /* fall through */ - case FIB_EVENT_ENTRY_DEL_TMP: + case FIB_EVENT_ENTRY_DEL: fen6_info = container_of(info, struct fib6_entry_notifier_info, info); err = mlxsw_sp_router_fib6_work_init(&fib_work->fib6_work, @@ -6174,7 +6174,6 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, return notifier_from_errno(err); case FIB_EVENT_ENTRY_ADD: /* fall through */ case FIB_EVENT_ENTRY_REPLACE: /* fall through */ - case FIB_EVENT_ENTRY_REPLACE_TMP: /* fall through */ case FIB_EVENT_ENTRY_APPEND: if (router->aborted) { NL_SET_ERR_MSG_MOD(info->extack, "FIB offload was aborted. Not configuring route"); diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 4e02a4231fcb..b5df308b4e33 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -178,7 +178,6 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, break; case FIB_EVENT_ENTRY_REPLACE: /* fall through */ - case FIB_EVENT_ENTRY_ADD: /* fall through */ case FIB_EVENT_ENTRY_DEL: err = nsim_fib_event(data, info, event != FIB_EVENT_ENTRY_DEL); break; diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index b3c54325caec..6d59221ff05a 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -23,8 +23,6 @@ enum fib_event_type { FIB_EVENT_NH_DEL, FIB_EVENT_VIF_ADD, FIB_EVENT_VIF_DEL, - FIB_EVENT_ENTRY_REPLACE_TMP, - FIB_EVENT_ENTRY_DEL_TMP, }; struct fib_notifier_ops { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 67ddee539f77..b1e9a10e1133 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -423,8 +423,7 @@ int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) }; rt->fib6_table->fib_seq++; - return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE_TMP, - &info.info); + return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { @@ -435,15 +434,7 @@ struct fib6_dump_arg { static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.fib6_null_entry) - return 0; - return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, - rt, arg->extack); -} - -static int fib6_rt_dump_tmp(struct fib6_info *rt, struct fib6_dump_arg *arg) -{ - enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) @@ -463,19 +454,9 @@ static int fib6_rt_dump_tmp(struct fib6_info *rt, struct fib6_dump_arg *arg) static int fib6_node_dump(struct fib6_walker *w) { - struct fib6_info *rt; - int err = 0; - - err = fib6_rt_dump_tmp(w->leaf, w->args); - if (err) - goto out; + int err; - for_each_fib6_walker_rt(w) { - err = fib6_rt_dump(rt, w->args); - if (err) - break; - } -out: + err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } @@ -1220,25 +1201,21 @@ next_iter: add: nlflags |= NLM_F_CREATE; - if (!info->skip_notify_kernel) { + /* The route should only be notified if it is the first + * route in the node or if it is added as a sibling + * route to the first route in the node. + */ + if (!info->skip_notify_kernel && + (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else - fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; - /* The route should only be notified if it is the first - * route in the node or if it is added as a sibling - * route to the first route in the node. - */ - if (notify_sibling_rt || ins == &fn->leaf) - err = call_fib6_entry_notifiers(info->nl_net, - fib_event, rt, - extack); - + fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); + fib_event, rt, + extack); if (err) { struct fib6_info *sibling, *next_sibling; @@ -1282,14 +1259,7 @@ add: return -ENOENT; } - if (!info->skip_notify_kernel) { - enum fib_event_type fib_event; - - fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; - if (ins == &fn->leaf) - err = call_fib6_entry_notifiers(info->nl_net, - fib_event, rt, - extack); + if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -2007,11 +1977,10 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, if (!info->skip_notify_kernel) { if (notify_del) - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL_TMP, + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 646716a47cc9..4b8659e077d3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3787,15 +3787,10 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) replace_rt); else call_fib6_multipath_entry_notifiers(net, - FIB_EVENT_ENTRY_DEL_TMP, + FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL); } - call_fib6_multipath_entry_notifiers(net, - FIB_EVENT_ENTRY_DEL, - rt, - rt->fib6_nsiblings, - NULL); list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -5074,7 +5069,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; - enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -5210,7 +5204,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, if (rt_notif->fib6_nsiblings != nhn - 1) fib_event = FIB_EVENT_ENTRY_APPEND; else - fib_event = FIB_EVENT_ENTRY_REPLACE_TMP; + fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_multipath_entry_notifiers(info->nl_net, fib_event, rt_notif, @@ -5221,14 +5215,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto add_errout; } } - event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; - err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, - rt_notif, nhn - 1, extack); - if (err) { - /* Delete all the siblings that were just added */ - err_nh = NULL; - goto add_errout; - } /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); -- cgit v1.2.3 From dfe6d68fc4049c799b2194d88523679a694ead75 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 25 Dec 2019 18:16:11 -0800 Subject: net: vlan: Use the PHY time stamping interface. The vlan layer tests fields of the phy_device in order to determine whether to invoke the PHY's tsinfo ethtool callback. This patch replaces the open coded logic with an invocation of the proper methods. Signed-off-by: Richard Cochran Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e5bff5cc6f97..5ff8059837b4 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -646,8 +646,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; struct phy_device *phydev = vlan->real_dev->phydev; - if (phydev && phydev->drv && phydev->drv->ts_info) { - return phydev->drv->ts_info(phydev, info); + if (phy_has_tsinfo(phydev)) { + return phy_ts_info(phydev, info); } else if (ops->get_ts_info) { return ops->get_ts_info(vlan->real_dev, info); } else { -- cgit v1.2.3 From 7774ee23689d3c56ecb5eff6cd1bf6060bdd3d8e Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 25 Dec 2019 18:16:12 -0800 Subject: net: ethtool: Use the PHY time stamping interface. The ethtool layer tests fields of the phy_device in order to determine whether to invoke the PHY's tsinfo ethtool callback. This patch replaces the open coded logic with an invocation of the proper methods. Signed-off-by: Richard Cochran Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index aed2c2cf1623..88f7cddf5a6f 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2096,8 +2096,8 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GET_TS_INFO; - if (phydev && phydev->drv && phydev->drv->ts_info) { - err = phydev->drv->ts_info(phydev, &info); + if (phy_has_tsinfo(phydev)) { + err = phy_ts_info(phydev, &info); } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, &info); } else { -- cgit v1.2.3 From 4715f65ffa0520af0680dbfbedbe349f175adaf4 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 25 Dec 2019 18:16:15 -0800 Subject: net: Introduce a new MII time stamping interface. Currently the stack supports time stamping in PHY devices. However, there are newer, non-PHY devices that can snoop an MII bus and provide time stamps. In order to support such devices, this patch introduces a new interface to be used by both PHY and non-PHY devices. In addition, the one and only user of the old PHY time stamping API is converted to the new interface. Signed-off-by: Richard Cochran Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/dp83640.c | 39 ++++++++++++++++----------- drivers/net/phy/phy.c | 4 +-- drivers/net/phy/phy_device.c | 2 ++ include/linux/mii_timestamper.h | 58 +++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 41 +++++++---------------------- net/core/timestamping.c | 20 +++++++------- 6 files changed, 106 insertions(+), 58 deletions(-) create mode 100644 include/linux/mii_timestamper.h (limited to 'net') diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index b58abdb5491e..ac72a324fcd1 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -98,6 +98,7 @@ struct dp83640_private { struct list_head list; struct dp83640_clock *clock; struct phy_device *phydev; + struct mii_timestamper mii_ts; struct delayed_work ts_work; int hwts_tx_en; int hwts_rx_en; @@ -1229,9 +1230,10 @@ static int dp83640_config_intr(struct phy_device *phydev) } } -static int dp83640_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) +static int dp83640_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr) { - struct dp83640_private *dp83640 = phydev->priv; + struct dp83640_private *dp83640 = + container_of(mii_ts, struct dp83640_private, mii_ts); struct hwtstamp_config cfg; u16 txcfg0, rxcfg0; @@ -1307,8 +1309,8 @@ static int dp83640_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) mutex_lock(&dp83640->clock->extreg_lock); - ext_write(0, phydev, PAGE5, PTP_TXCFG0, txcfg0); - ext_write(0, phydev, PAGE5, PTP_RXCFG0, rxcfg0); + ext_write(0, dp83640->phydev, PAGE5, PTP_TXCFG0, txcfg0); + ext_write(0, dp83640->phydev, PAGE5, PTP_RXCFG0, rxcfg0); mutex_unlock(&dp83640->clock->extreg_lock); @@ -1338,10 +1340,11 @@ static void rx_timestamp_work(struct work_struct *work) schedule_delayed_work(&dp83640->ts_work, SKB_TIMESTAMP_TIMEOUT); } -static bool dp83640_rxtstamp(struct phy_device *phydev, +static bool dp83640_rxtstamp(struct mii_timestamper *mii_ts, struct sk_buff *skb, int type) { - struct dp83640_private *dp83640 = phydev->priv; + struct dp83640_private *dp83640 = + container_of(mii_ts, struct dp83640_private, mii_ts); struct dp83640_skb_info *skb_info = (struct dp83640_skb_info *)skb->cb; struct list_head *this, *next; struct rxts *rxts; @@ -1387,11 +1390,12 @@ static bool dp83640_rxtstamp(struct phy_device *phydev, return true; } -static void dp83640_txtstamp(struct phy_device *phydev, +static void dp83640_txtstamp(struct mii_timestamper *mii_ts, struct sk_buff *skb, int type) { struct dp83640_skb_info *skb_info = (struct dp83640_skb_info *)skb->cb; - struct dp83640_private *dp83640 = phydev->priv; + struct dp83640_private *dp83640 = + container_of(mii_ts, struct dp83640_private, mii_ts); switch (dp83640->hwts_tx_en) { @@ -1414,9 +1418,11 @@ static void dp83640_txtstamp(struct phy_device *phydev, } } -static int dp83640_ts_info(struct phy_device *dev, struct ethtool_ts_info *info) +static int dp83640_ts_info(struct mii_timestamper *mii_ts, + struct ethtool_ts_info *info) { - struct dp83640_private *dp83640 = dev->priv; + struct dp83640_private *dp83640 = + container_of(mii_ts, struct dp83640_private, mii_ts); info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE | @@ -1454,13 +1460,18 @@ static int dp83640_probe(struct phy_device *phydev) goto no_memory; dp83640->phydev = phydev; - INIT_DELAYED_WORK(&dp83640->ts_work, rx_timestamp_work); + dp83640->mii_ts.rxtstamp = dp83640_rxtstamp; + dp83640->mii_ts.txtstamp = dp83640_txtstamp; + dp83640->mii_ts.hwtstamp = dp83640_hwtstamp; + dp83640->mii_ts.ts_info = dp83640_ts_info; + INIT_DELAYED_WORK(&dp83640->ts_work, rx_timestamp_work); INIT_LIST_HEAD(&dp83640->rxts); INIT_LIST_HEAD(&dp83640->rxpool); for (i = 0; i < MAX_RXTS; i++) list_add(&dp83640->rx_pool_data[i].list, &dp83640->rxpool); + phydev->mii_ts = &dp83640->mii_ts; phydev->priv = dp83640; spin_lock_init(&dp83640->rx_lock); @@ -1501,6 +1512,8 @@ static void dp83640_remove(struct phy_device *phydev) if (phydev->mdio.addr == BROADCAST_ADDR) return; + phydev->mii_ts = NULL; + enable_status_frames(phydev, false); cancel_delayed_work_sync(&dp83640->ts_work); @@ -1537,10 +1550,6 @@ static struct phy_driver dp83640_driver = { .config_init = dp83640_config_init, .ack_interrupt = dp83640_ack_interrupt, .config_intr = dp83640_config_intr, - .ts_info = dp83640_ts_info, - .hwtstamp = dp83640_hwtstamp, - .rxtstamp = dp83640_rxtstamp, - .txtstamp = dp83640_txtstamp, }; static int __init dp83640_init(void) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 80be4d691e5b..541ed01496bf 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -422,8 +422,8 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) return 0; case SIOCSHWTSTAMP: - if (phydev->drv && phydev->drv->hwtstamp) - return phydev->drv->hwtstamp(phydev, ifr); + if (phydev->mii_ts && phydev->mii_ts->hwtstamp) + return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr); /* fall through */ default: diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 94961e7592e4..e55119cd6c86 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -919,6 +919,8 @@ static void phy_link_change(struct phy_device *phydev, bool up, bool do_carrier) netif_carrier_off(netdev); } phydev->adjust_link(netdev); + if (phydev->mii_ts && phydev->mii_ts->link_state) + phydev->mii_ts->link_state(phydev->mii_ts, phydev); } /** diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h new file mode 100644 index 000000000000..36002386029c --- /dev/null +++ b/include/linux/mii_timestamper.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for generic time stamping devices on MII buses. + * Copyright (C) 2018 Richard Cochran + */ +#ifndef _LINUX_MII_TIMESTAMPER_H +#define _LINUX_MII_TIMESTAMPER_H + +#include +#include +#include + +struct phy_device; + +/** + * struct mii_timestamper - Callback interface to MII time stamping devices. + * + * @rxtstamp: Requests a Rx timestamp for 'skb'. If the skb is accepted, + * the MII time stamping device promises to deliver it using + * netif_rx() as soon as a timestamp becomes available. One of + * the PTP_CLASS_ values is passed in 'type'. The function + * must return true if the skb is accepted for delivery. + * + * @txtstamp: Requests a Tx timestamp for 'skb'. The MII time stamping + * device promises to deliver it using skb_complete_tx_timestamp() + * as soon as a timestamp becomes available. One of the PTP_CLASS_ + * values is passed in 'type'. + * + * @hwtstamp: Handles SIOCSHWTSTAMP ioctl for hardware time stamping. + * + * @link_state: Allows the device to respond to changes in the link + * state. The caller invokes this function while holding + * the phy_device mutex. + * + * @ts_info: Handles ethtool queries for hardware time stamping. + * + * Drivers for PHY time stamping devices should embed their + * mii_timestamper within a private structure, obtaining a reference + * to it using container_of(). + */ +struct mii_timestamper { + bool (*rxtstamp)(struct mii_timestamper *mii_ts, + struct sk_buff *skb, int type); + + void (*txtstamp)(struct mii_timestamper *mii_ts, + struct sk_buff *skb, int type); + + int (*hwtstamp)(struct mii_timestamper *mii_ts, + struct ifreq *ifreq); + + void (*link_state)(struct mii_timestamper *mii_ts, + struct phy_device *phydev); + + int (*ts_info)(struct mii_timestamper *mii_ts, + struct ethtool_ts_info *ts_info); +}; + +#endif diff --git a/include/linux/phy.h b/include/linux/phy.h index 0248f5e9939d..30e599c454db 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -441,6 +442,7 @@ struct phy_device { struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; + struct mii_timestamper *mii_ts; u8 mdix; u8 mdix_ctrl; @@ -546,29 +548,6 @@ struct phy_driver { */ int (*match_phy_device)(struct phy_device *phydev); - /* Handles ethtool queries for hardware time stamping. */ - int (*ts_info)(struct phy_device *phydev, struct ethtool_ts_info *ti); - - /* Handles SIOCSHWTSTAMP ioctl for hardware time stamping. */ - int (*hwtstamp)(struct phy_device *phydev, struct ifreq *ifr); - - /* - * Requests a Rx timestamp for 'skb'. If the skb is accepted, - * the phy driver promises to deliver it using netif_rx() as - * soon as a timestamp becomes available. One of the - * PTP_CLASS_ values is passed in 'type'. The function must - * return true if the skb is accepted for delivery. - */ - bool (*rxtstamp)(struct phy_device *dev, struct sk_buff *skb, int type); - - /* - * Requests a Tx timestamp for 'skb'. The phy driver promises - * to deliver it using skb_complete_tx_timestamp() as soon as a - * timestamp becomes available. One of the PTP_CLASS_ values - * is passed in 'type'. - */ - void (*txtstamp)(struct phy_device *dev, struct sk_buff *skb, int type); - /* Some devices (e.g. qnap TS-119P II) require PHY register changes to * enable Wake on LAN, so set_wol is provided to be called in the * ethernet driver's set_wol function. */ @@ -942,7 +921,7 @@ static inline bool phy_polling_mode(struct phy_device *phydev) */ static inline bool phy_has_hwtstamp(struct phy_device *phydev) { - return phydev && phydev->drv && phydev->drv->hwtstamp; + return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp; } /** @@ -951,7 +930,7 @@ static inline bool phy_has_hwtstamp(struct phy_device *phydev) */ static inline bool phy_has_rxtstamp(struct phy_device *phydev) { - return phydev && phydev->drv && phydev->drv->rxtstamp; + return phydev && phydev->mii_ts && phydev->mii_ts->rxtstamp; } /** @@ -961,7 +940,7 @@ static inline bool phy_has_rxtstamp(struct phy_device *phydev) */ static inline bool phy_has_tsinfo(struct phy_device *phydev) { - return phydev && phydev->drv && phydev->drv->ts_info; + return phydev && phydev->mii_ts && phydev->mii_ts->ts_info; } /** @@ -970,30 +949,30 @@ static inline bool phy_has_tsinfo(struct phy_device *phydev) */ static inline bool phy_has_txtstamp(struct phy_device *phydev) { - return phydev && phydev->drv && phydev->drv->txtstamp; + return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) { - return phydev->drv->hwtstamp(phydev, ifr); + return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { - return phydev->drv->rxtstamp(phydev, skb, type); + return phydev->mii_ts->rxtstamp(phydev->mii_ts, skb, type); } static inline int phy_ts_info(struct phy_device *phydev, struct ethtool_ts_info *tsinfo) { - return phydev->drv->ts_info(phydev, tsinfo); + return phydev->mii_ts->ts_info(phydev->mii_ts, tsinfo); } static inline void phy_txtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { - phydev->drv->txtstamp(phydev, skb, type); + phydev->mii_ts->txtstamp(phydev->mii_ts, skb, type); } /** diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 7911235706a9..04840697fe79 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -13,7 +13,7 @@ static unsigned int classify(const struct sk_buff *skb) { if (likely(skb->dev && skb->dev->phydev && - skb->dev->phydev->drv)) + skb->dev->phydev->mii_ts)) return ptp_classify_raw(skb); else return PTP_CLASS_NONE; @@ -21,7 +21,7 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { - struct phy_device *phydev; + struct mii_timestamper *mii_ts; struct sk_buff *clone; unsigned int type; @@ -32,22 +32,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return; - phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) return; - phydev->drv->txtstamp(phydev, clone, type); + mii_ts->txtstamp(mii_ts, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { - struct phy_device *phydev; + struct mii_timestamper *mii_ts; unsigned int type; - if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv) + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) return false; if (skb_headroom(skb) < ETH_HLEN) @@ -62,9 +62,9 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->rxtstamp)) + return mii_ts->rxtstamp(mii_ts, skb, type); return false; } -- cgit v1.2.3 From 767ff483731502a0fc34f34a3a0851aca175eb71 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 25 Dec 2019 18:16:16 -0800 Subject: net: Add a layer for non-PHY MII time stamping drivers. While PHY time stamping drivers can simply attach their interface directly to the PHY instance, stand alone drivers require support in order to manage their services. Non-PHY MII time stamping drivers have a control interface over another bus like I2C, SPI, UART, or via a memory mapped peripheral. The controller device will be associated with one or more time stamping channels, each of which sits snoops in on a MII bus. This patch provides a glue layer that will enable time stamping channels to find their controlling device. Signed-off-by: Richard Cochran Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/Makefile | 2 + drivers/net/phy/mii_timestamper.c | 125 ++++++++++++++++++++++++++++++++++++++ include/linux/mii_timestamper.h | 63 +++++++++++++++++++ net/Kconfig | 7 ++- 4 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 drivers/net/phy/mii_timestamper.c (limited to 'net') diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index d846b4dc1c68..fe5badf13b65 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -43,6 +43,8 @@ obj-$(CONFIG_MDIO_SUN4I) += mdio-sun4i.o obj-$(CONFIG_MDIO_THUNDER) += mdio-thunder.o obj-$(CONFIG_MDIO_XGENE) += mdio-xgene.o +obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += mii_timestamper.o + obj-$(CONFIG_SFP) += sfp.o sfp-obj-$(CONFIG_SFP) += sfp-bus.o obj-y += $(sfp-obj-y) $(sfp-obj-m) diff --git a/drivers/net/phy/mii_timestamper.c b/drivers/net/phy/mii_timestamper.c new file mode 100644 index 000000000000..2f12c5d901df --- /dev/null +++ b/drivers/net/phy/mii_timestamper.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Support for generic time stamping devices on MII buses. +// Copyright (C) 2018 Richard Cochran +// + +#include + +static LIST_HEAD(mii_timestamping_devices); +static DEFINE_MUTEX(tstamping_devices_lock); + +struct mii_timestamping_desc { + struct list_head list; + struct mii_timestamping_ctrl *ctrl; + struct device *device; +}; + +/** + * register_mii_tstamp_controller() - registers an MII time stamping device. + * + * @device: The device to be registered. + * @ctrl: Pointer to device's control interface. + * + * Returns zero on success or non-zero on failure. + */ +int register_mii_tstamp_controller(struct device *device, + struct mii_timestamping_ctrl *ctrl) +{ + struct mii_timestamping_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return -ENOMEM; + + INIT_LIST_HEAD(&desc->list); + desc->ctrl = ctrl; + desc->device = device; + + mutex_lock(&tstamping_devices_lock); + list_add_tail(&mii_timestamping_devices, &desc->list); + mutex_unlock(&tstamping_devices_lock); + + return 0; +} +EXPORT_SYMBOL(register_mii_tstamp_controller); + +/** + * unregister_mii_tstamp_controller() - unregisters an MII time stamping device. + * + * @device: A device previously passed to register_mii_tstamp_controller(). + */ +void unregister_mii_tstamp_controller(struct device *device) +{ + struct mii_timestamping_desc *desc; + struct list_head *this, *next; + + mutex_lock(&tstamping_devices_lock); + list_for_each_safe(this, next, &mii_timestamping_devices) { + desc = list_entry(this, struct mii_timestamping_desc, list); + if (desc->device == device) { + list_del_init(&desc->list); + kfree(desc); + break; + } + } + mutex_unlock(&tstamping_devices_lock); +} +EXPORT_SYMBOL(unregister_mii_tstamp_controller); + +/** + * register_mii_timestamper - Enables a given port of an MII time stamper. + * + * @node: The device tree node of the MII time stamp controller. + * @port: The index of the port to be enabled. + * + * Returns a valid interface on success or ERR_PTR otherwise. + */ +struct mii_timestamper *register_mii_timestamper(struct device_node *node, + unsigned int port) +{ + struct mii_timestamper *mii_ts = NULL; + struct mii_timestamping_desc *desc; + struct list_head *this; + + mutex_lock(&tstamping_devices_lock); + list_for_each(this, &mii_timestamping_devices) { + desc = list_entry(this, struct mii_timestamping_desc, list); + if (desc->device->of_node == node) { + mii_ts = desc->ctrl->probe_channel(desc->device, port); + if (!IS_ERR(mii_ts)) { + mii_ts->device = desc->device; + get_device(desc->device); + } + break; + } + } + mutex_unlock(&tstamping_devices_lock); + + return mii_ts ? mii_ts : ERR_PTR(-EPROBE_DEFER); +} +EXPORT_SYMBOL(register_mii_timestamper); + +/** + * unregister_mii_timestamper - Disables a given MII time stamper. + * + * @mii_ts: An interface obtained via register_mii_timestamper(). + * + */ +void unregister_mii_timestamper(struct mii_timestamper *mii_ts) +{ + struct mii_timestamping_desc *desc; + struct list_head *this; + + mutex_lock(&tstamping_devices_lock); + list_for_each(this, &mii_timestamping_devices) { + desc = list_entry(this, struct mii_timestamping_desc, list); + if (desc->device == mii_ts->device) { + desc->ctrl->release_channel(desc->device, mii_ts); + put_device(desc->device); + break; + } + } + mutex_unlock(&tstamping_devices_lock); +} +EXPORT_SYMBOL(unregister_mii_timestamper); diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index 36002386029c..fa940bbaf8ae 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -33,10 +33,15 @@ struct phy_device; * the phy_device mutex. * * @ts_info: Handles ethtool queries for hardware time stamping. + * @device: Remembers the device to which the instance belongs. * * Drivers for PHY time stamping devices should embed their * mii_timestamper within a private structure, obtaining a reference * to it using container_of(). + * + * Drivers for non-PHY time stamping devices should return a pointer + * to a mii_timestamper from the probe_channel() callback of their + * mii_timestamping_ctrl interface. */ struct mii_timestamper { bool (*rxtstamp)(struct mii_timestamper *mii_ts, @@ -53,6 +58,64 @@ struct mii_timestamper { int (*ts_info)(struct mii_timestamper *mii_ts, struct ethtool_ts_info *ts_info); + + struct device *device; +}; + +/** + * struct mii_timestamping_ctrl - MII time stamping controller interface. + * + * @probe_channel: Callback into the controller driver announcing the + * presence of the 'port' channel. The 'device' field + * had been passed to register_mii_tstamp_controller(). + * The driver must return either a pointer to a valid + * MII timestamper instance or PTR_ERR. + * + * @release_channel: Releases an instance obtained via .probe_channel. + */ +struct mii_timestamping_ctrl { + struct mii_timestamper *(*probe_channel)(struct device *device, + unsigned int port); + void (*release_channel)(struct device *device, + struct mii_timestamper *mii_ts); }; +#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING + +int register_mii_tstamp_controller(struct device *device, + struct mii_timestamping_ctrl *ctrl); + +void unregister_mii_tstamp_controller(struct device *device); + +struct mii_timestamper *register_mii_timestamper(struct device_node *node, + unsigned int port); + +void unregister_mii_timestamper(struct mii_timestamper *mii_ts); + +#else + +static inline +int register_mii_tstamp_controller(struct device *device, + struct mii_timestamping_ctrl *ctrl) +{ + return -EOPNOTSUPP; +} + +static inline void unregister_mii_tstamp_controller(struct device *device) +{ +} + +static inline +struct mii_timestamper *register_mii_timestamper(struct device_node *node, + unsigned int port) +{ + return NULL; +} + +static inline void unregister_mii_timestamper(struct mii_timestamper *mii_ts) +{ +} + +#endif + #endif diff --git a/net/Kconfig b/net/Kconfig index bd191f978a23..52af65e5d28c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -108,9 +108,10 @@ config NETWORK_PHY_TIMESTAMPING bool "Timestamping in PHY devices" select NET_PTP_CLASSIFY help - This allows timestamping of network packets by PHYs with - hardware timestamping capabilities. This option adds some - overhead in the transmit and receive paths. + This allows timestamping of network packets by PHYs (or + other MII bus snooping devices) with hardware timestamping + capabilities. This option adds some overhead in the transmit + and receive paths. If you are unsure how to answer this question, answer N. -- cgit v1.2.3 From b6fd7b96366769651ab23988607ce9c5c9042cdb Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 25 Dec 2019 18:16:19 -0800 Subject: net: Introduce peer to peer one step PTP time stamping. The 1588 standard defines one step operation for both Sync and PDelay_Resp messages. Up until now, hardware with P2P one step has been rare, and kernel support was lacking. This patch adds support of the mode in anticipation of new hardware developments. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 1 + drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c | 1 + drivers/net/ethernet/microchip/lan743x_ptp.c | 3 +++ drivers/net/ethernet/qlogic/qede/qede_ptp.c | 1 + include/uapi/linux/net_tstamp.h | 8 ++++++++ net/core/dev_ioctl.c | 1 + 6 files changed, 15 insertions(+) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index cff64e43bdd8..741d865e4afc 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -15410,6 +15410,7 @@ int bnx2x_configure_ptp_filters(struct bnx2x *bp) REG_WR(bp, rule, BNX2X_PTP_TX_ON_RULE_MASK); break; case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: BNX2X_ERR("One-step timestamping is not supported\n"); return -ERANGE; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c index ec2ff3d7f41c..4aaaa4937b1a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c @@ -920,6 +920,7 @@ static int mlxsw_sp_ptp_get_message_types(const struct hwtstamp_config *config, egr_types = 0xff; break; case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: return -ERANGE; } diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index afe52463dc57..9399f6a98748 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -1265,6 +1265,9 @@ int lan743x_ptp_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) lan743x_ptp_set_sync_ts_insert(adapter, true); break; + case HWTSTAMP_TX_ONESTEP_P2P: + ret = -ERANGE; + break; default: netif_warn(adapter, drv, adapter->netdev, " tx_type = %d, UNKNOWN\n", config.tx_type); diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c index f815435cf106..4c7f7a7fc151 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c @@ -247,6 +247,7 @@ static int qede_ptp_cfg_filters(struct qede_dev *edev) break; case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: DP_ERR(edev, "One-step timestamping is not supported\n"); return -ERANGE; } diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index e5b39721c6e4..f96e650d0af9 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -90,6 +90,14 @@ enum hwtstamp_tx_types { * queue. */ HWTSTAMP_TX_ONESTEP_SYNC, + + /* + * Same as HWTSTAMP_TX_ONESTEP_SYNC, but also enables time + * stamp insertion directly into PDelay_Resp packets. In this + * case, neither transmitted Sync nor PDelay_Resp packets will + * receive a time stamp via the socket error queue. + */ + HWTSTAMP_TX_ONESTEP_P2P, }; /* possible values for hwtstamp_config->rx_filter */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 5163d900bb4f..dbaebbe573f0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -187,6 +187,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; } -- cgit v1.2.3 From db8f6f5c8de6dae924a68858ad6a4217f735be13 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:13 +0100 Subject: netfilter: nft_meta: move time handling to helper reduce size of the (large) meta evaluation function. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 9740b554fdb3..ba74f3ee7264 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -33,8 +33,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -static u8 nft_meta_weekday(time64_t secs) +static u8 nft_meta_weekday(void) { + time64_t secs = ktime_get_real_seconds(); unsigned int dse; u8 wday; @@ -56,6 +57,25 @@ static u32 nft_meta_hour(time64_t secs) + tm.tm_sec; } +static noinline_for_stack void +nft_meta_get_eval_time(enum nft_meta_keys key, + u32 *dest) +{ + switch (key) { + case NFT_META_TIME_NS: + nft_reg_store64(dest, ktime_get_real_ns()); + break; + case NFT_META_TIME_DAY: + nft_reg_store8(dest, nft_meta_weekday()); + break; + case NFT_META_TIME_HOUR: + *dest = nft_meta_hour(ktime_get_real_seconds()); + break; + default: + break; + } +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -247,13 +267,9 @@ void nft_meta_get_eval(const struct nft_expr *expr, strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); - break; case NFT_META_TIME_DAY: - nft_reg_store8(dest, nft_meta_weekday(ktime_get_real_seconds())); - break; case NFT_META_TIME_HOUR: - *dest = nft_meta_hour(ktime_get_real_seconds()); + nft_meta_get_eval_time(priv->key, dest); break; default: WARN_ON(1); -- cgit v1.2.3 From 4a54594abdbee230a0471aa137b3d5405c897661 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:14 +0100 Subject: netfilter: nft_meta: move pkttype handling to helper When pkttype is loopback, nft_meta performs guesswork to detect broad/multicast packets. Place this in a helper, this is hardly a hot path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 90 +++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index ba74f3ee7264..fe49b27dfa87 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -76,6 +76,56 @@ nft_meta_get_eval_time(enum nft_meta_keys key, } } +static noinline bool +nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt, + u32 *dest) +{ + const struct sk_buff *skb = pkt->skb; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + nft_reg_store8(dest, PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + break; + case NFPROTO_IPV6: + nft_reg_store8(dest, PACKET_MULTICAST); + break; + case NFPROTO_NETDEV: + switch (skb->protocol) { + case htons(ETH_P_IP): { + int noff = skb_network_offset(skb); + struct iphdr *iph, _iph; + + iph = skb_header_pointer(skb, noff, + sizeof(_iph), &_iph); + if (!iph) + return false; + + if (ipv4_is_multicast(iph->daddr)) + nft_reg_store8(dest, PACKET_MULTICAST); + else + nft_reg_store8(dest, PACKET_BROADCAST); + + break; + } + case htons(ETH_P_IPV6): + nft_reg_store8(dest, PACKET_MULTICAST); + break; + default: + WARN_ON_ONCE(1); + return false; + } + break; + default: + WARN_ON_ONCE(1); + return false; + } + + return true; +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -183,46 +233,8 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; } - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - break; - case NFPROTO_IPV6: - nft_reg_store8(dest, PACKET_MULTICAST); - break; - case NFPROTO_NETDEV: - switch (skb->protocol) { - case htons(ETH_P_IP): { - int noff = skb_network_offset(skb); - struct iphdr *iph, _iph; - - iph = skb_header_pointer(skb, noff, - sizeof(_iph), &_iph); - if (!iph) - goto err; - - if (ipv4_is_multicast(iph->daddr)) - nft_reg_store8(dest, PACKET_MULTICAST); - else - nft_reg_store8(dest, PACKET_BROADCAST); - - break; - } - case htons(ETH_P_IPV6): - nft_reg_store8(dest, PACKET_MULTICAST); - break; - default: - WARN_ON_ONCE(1); - goto err; - } - break; - default: - WARN_ON_ONCE(1); + if (!nft_meta_get_eval_pkttype_lo(pkt, dest)) goto err; - } break; case NFT_META_CPU: *dest = raw_smp_processor_id(); -- cgit v1.2.3 From 726b44f044e8e67cbe2209c1a5704aca981be3b2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:15 +0100 Subject: netfilter: nft_meta: move sk uid/git handling to helper Not a hot path. Also, both have copy&paste case statements, so use a common helper for both. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 65 +++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index fe49b27dfa87..1b32440ec2e6 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -126,6 +126,41 @@ nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt, return true; } +static noinline bool +nft_meta_get_eval_skugid(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + struct socket *sock; + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + read_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + if (!sock || !sock->file) { + read_unlock_bh(&sk->sk_callback_lock); + return false; + } + + switch (key) { + case NFT_META_SKUID: + *dest = from_kuid_munged(&init_user_ns, + sock->file->f_cred->fsuid); + break; + case NFT_META_SKGID: + *dest = from_kgid_munged(&init_user_ns, + sock->file->f_cred->fsgid); + break; + default: + break; + } + + read_unlock_bh(&sk->sk_callback_lock); + return true; +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -180,37 +215,9 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) - goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); - goto err; - } - - *dest = from_kuid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsuid); - read_unlock_bh(&sk->sk_callback_lock); - break; case NFT_META_SKGID: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) + if (!nft_meta_get_eval_skugid(priv->key, dest, pkt)) goto err; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket == NULL || - sk->sk_socket->file == NULL) { - read_unlock_bh(&sk->sk_callback_lock); - goto err; - } - *dest = from_kgid_munged(&init_user_ns, - sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&sk->sk_callback_lock); break; #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: { -- cgit v1.2.3 From b1327fbc29915ef5bf0eec5abc2e2e67983cb037 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:16 +0100 Subject: netfilter: nft_meta: move cgroup handling to helper Reduce size of main eval function. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 1b32440ec2e6..3fca1c3ec361 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -161,6 +161,20 @@ nft_meta_get_eval_skugid(enum nft_meta_keys key, return true; } +#ifdef CONFIG_CGROUP_NET_CLASSID +static noinline bool +nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); + return true; +} +#endif + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -168,7 +182,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - struct sock *sk; u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { @@ -258,11 +271,8 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: - sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk) || - !net_eq(nft_net(pkt), sock_net(sk))) + if (!nft_meta_get_eval_cgroup(dest, pkt)) goto err; - *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif case NFT_META_PRANDOM: { -- cgit v1.2.3 From a4150a1faa3604beef25ed2374d537f86059ad52 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:17 +0100 Subject: netfilter: nft_meta: move interface kind handling to helper checkpatch complains about == NULL checks in original code, so use !in instead. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 3fca1c3ec361..2f7cc64b0c15 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -175,6 +175,30 @@ nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt) } #endif +static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, + u32 *dest, + const struct nft_pktinfo *pkt) +{ + const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); + + switch (key) { + case NFT_META_IIFKIND: + if (!in || !in->rtnl_link_ops) + return false; + strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_OIFKIND: + if (!out || !out->rtnl_link_ops) + return false; + strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; + default: + return false; + } + + return true; +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -286,14 +310,9 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; #endif case NFT_META_IIFKIND: - if (in == NULL || in->rtnl_link_ops == NULL) - goto err; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); - break; case NFT_META_OIFKIND: - if (out == NULL || out->rtnl_link_ops == NULL) + if (!nft_meta_get_eval_kind(priv->key, dest, pkt)) goto err; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_TIME_NS: case NFT_META_TIME_DAY: -- cgit v1.2.3 From 8724e819cc9a8fab4c18e791cb0bd602fb294971 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:18 +0100 Subject: netfilter: nft_meta: move all interface related keys to helper Reduces repetiveness and reduces size of meta eval function. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 95 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2f7cc64b0c15..022f1473ddd1 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -199,13 +199,79 @@ static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, return true; } +static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) +{ + *dest = dev ? dev->ifindex : 0; +} + +static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) +{ + strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); +} + +static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + nft_reg_store16(dest, dev->type); + return true; +} + +static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev) +{ + if (!dev) + return false; + + *dest = dev->group; + return true; +} + +static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, + const struct nft_pktinfo *pkt) +{ + switch (key) { + case NFT_META_IIFNAME: + nft_meta_store_ifname(dest, nft_in(pkt)); + break; + case NFT_META_OIFNAME: + nft_meta_store_ifname(dest, nft_out(pkt)); + break; + case NFT_META_IIF: + nft_meta_store_ifindex(dest, nft_in(pkt)); + break; + case NFT_META_OIF: + nft_meta_store_ifindex(dest, nft_out(pkt)); + break; + case NFT_META_IIFTYPE: + if (!nft_meta_store_iftype(dest, nft_in(pkt))) + return false; + break; + case NFT_META_OIFTYPE: + if (!nft_meta_store_iftype(dest, nft_out(pkt))) + return false; + break; + case NFT_META_IIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + case NFT_META_OIFGROUP: + if (!nft_meta_store_ifgroup(dest, nft_out(pkt))) + return false; + break; + default: + return false; + } + + return true; +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { @@ -230,26 +296,15 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = skb->mark; break; case NFT_META_IIF: - *dest = in ? in->ifindex : 0; - break; case NFT_META_OIF: - *dest = out ? out->ifindex : 0; - break; case NFT_META_IIFNAME: - strncpy((char *)dest, in ? in->name : "", IFNAMSIZ); - break; case NFT_META_OIFNAME: - strncpy((char *)dest, out ? out->name : "", IFNAMSIZ); - break; case NFT_META_IIFTYPE: - if (in == NULL) - goto err; - nft_reg_store16(dest, in->type); - break; case NFT_META_OIFTYPE: - if (out == NULL) + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: + if (!nft_meta_get_eval_ifname(priv->key, dest, pkt)) goto err; - nft_reg_store16(dest, out->type); break; case NFT_META_SKUID: case NFT_META_SKGID: @@ -283,16 +338,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_CPU: *dest = raw_smp_processor_id(); break; - case NFT_META_IIFGROUP: - if (in == NULL) - goto err; - *dest = in->group; - break; - case NFT_META_OIFGROUP: - if (out == NULL) - goto err; - *dest = out->group; - break; #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: if (!nft_meta_get_eval_cgroup(dest, pkt)) -- cgit v1.2.3 From 6b2faee0ca91b63bd5a3b2087d4f25c76b983961 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:19 +0100 Subject: netfilter: nft_meta: place prandom handling in a helper Move this out of the main eval loop, the numgen expression provides a better alternative to meta random. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 022f1473ddd1..ac6fc95387dc 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -266,6 +266,13 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, return true; } +static noinline u32 nft_prandom_u32(void) +{ + struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); + + return prandom_u32_state(state); +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -344,11 +351,9 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; break; #endif - case NFT_META_PRANDOM: { - struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); - *dest = prandom_u32_state(state); + case NFT_META_PRANDOM: + *dest = nft_prandom_u32(); break; - } #ifdef CONFIG_XFRM case NFT_META_SECPATH: nft_reg_store8(dest, secpath_exists(skb)); -- cgit v1.2.3 From 01a0fc82252d82eda50d4e1252b826a3ef7afb3d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:20 +0100 Subject: netfilter: nft_meta: place rtclassid handling in a helper skb_dst is an inline helper with a WARN_ON(), so this is a bit more code than it looks like. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_meta.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index ac6fc95387dc..fb1a571db924 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -273,6 +273,20 @@ static noinline u32 nft_prandom_u32(void) return prandom_u32_state(state); } +#ifdef CONFIG_IP_ROUTE_CLASSID +static noinline bool +nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) +{ + const struct dst_entry *dst = skb_dst(skb); + + if (!dst) + return false; + + *dest = dst->tclassid; + return true; +} +#endif + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -319,14 +333,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; break; #ifdef CONFIG_IP_ROUTE_CLASSID - case NFT_META_RTCLASSID: { - const struct dst_entry *dst = skb_dst(skb); - - if (dst == NULL) + case NFT_META_RTCLASSID: + if (!nft_meta_get_eval_rtclassid(skb, dest)) goto err; - *dest = dst->tclassid; break; - } #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: -- cgit v1.2.3 From c14ceb0ec727187f71a487a592ffa91767fed66e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Dec 2019 12:05:21 +0100 Subject: netfilter: nft_meta: add support for slave device ifindex matching Allow to match on vrf slave ifindex or name. In case there was no slave interface involved, store 0 in the destination register just like existing iif/oif matching. sdif(name) is restricted to the ipv4/ipv6 input and forward hooks, as it depends on ip(6) stack parsing/storing info in skb->cb[]. Cc: Martin Willi Cc: David Ahern Cc: Shrijeet Mukherjee Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++ net/netfilter/nft_meta.c | 76 +++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index bb9b049310df..e237ecbdcd8a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -805,6 +805,8 @@ enum nft_exthdr_attributes { * @NFT_META_TIME_NS: time since epoch (in nanoseconds) * @NFT_META_TIME_DAY: day of week (from 0 = Sunday to 6 = Saturday) * @NFT_META_TIME_HOUR: hour of day (in seconds) + * @NFT_META_SDIF: slave device interface index + * @NFT_META_SDIFNAME: slave device interface name */ enum nft_meta_keys { NFT_META_LEN, @@ -840,6 +842,8 @@ enum nft_meta_keys { NFT_META_TIME_NS, NFT_META_TIME_DAY, NFT_META_TIME_HOUR, + NFT_META_SDIF, + NFT_META_SDIFNAME, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index fb1a571db924..951b6e87ed5d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include /* for TCP_TIME_WAIT */ #include @@ -287,6 +288,28 @@ nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) } #endif +static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return inet_sdif(pkt->skb); + case NFPROTO_IPV6: + return inet6_sdif(pkt->skb); + } + + return 0; +} + +static noinline void +nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt) +{ + u32 sdif = nft_meta_get_eval_sdif(pkt); + const struct net_device *dev; + + dev = sdif ? dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL; + nft_meta_store_ifname(dest, dev); +} + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -379,6 +402,12 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_TIME_HOUR: nft_meta_get_eval_time(priv->key, dest); break; + case NFT_META_SDIF: + *dest = nft_meta_get_eval_sdif(pkt); + break; + case NFT_META_SDIFNAME: + nft_meta_get_eval_sdifname(dest, pkt); + break; default: WARN_ON(1); goto err; @@ -459,6 +488,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_MARK: case NFT_META_IIF: case NFT_META_OIF: + case NFT_META_SDIF: case NFT_META_SKUID: case NFT_META_SKGID: #ifdef CONFIG_IP_ROUTE_CLASSID @@ -480,6 +510,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_OIFNAME: case NFT_META_IIFKIND: case NFT_META_OIFKIND: + case NFT_META_SDIFNAME: len = IFNAMSIZ; break; case NFT_META_PRANDOM: @@ -510,16 +541,28 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_get_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx) { -#ifdef CONFIG_XFRM - const struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; - if (priv->key != NFT_META_SECPATH) - return 0; + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + +static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx) +{ +#ifdef CONFIG_XFRM + unsigned int hooks; switch (ctx->family) { case NFPROTO_NETDEV: @@ -542,6 +585,25 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } +static int nft_meta_get_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + switch (priv->key) { + case NFT_META_SECPATH: + return nft_meta_get_validate_xfrm(ctx); + case NFT_META_SDIF: + case NFT_META_SDIFNAME: + return nft_meta_get_validate_sdif(ctx); + default: + break; + } + + return 0; +} + int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) -- cgit v1.2.3 From f643ee295c1c63bc117fb052d4da681354d6f732 Mon Sep 17 00:00:00 2001 From: Kevin Kou Date: Thu, 26 Dec 2019 12:29:17 +0000 Subject: sctp: move trace_sctp_probe_path into sctp_outq_sack The original patch bringed in the "SCTP ACK tracking trace event" feature was committed at Dec.20, 2017, it replaced jprobe usage with trace events, and bringed in two trace events, one is TRACE_EVENT(sctp_probe), another one is TRACE_EVENT(sctp_probe_path). The original patch intended to trigger the trace_sctp_probe_path in TRACE_EVENT(sctp_probe) as below code, +TRACE_EVENT(sctp_probe, + + TP_PROTO(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk), + + TP_ARGS(ep, asoc, chunk), + + TP_STRUCT__entry( + __field(__u64, asoc) + __field(__u32, mark) + __field(__u16, bind_port) + __field(__u16, peer_port) + __field(__u32, pathmtu) + __field(__u32, rwnd) + __field(__u16, unack_data) + ), + + TP_fast_assign( + struct sk_buff *skb = chunk->skb; + + __entry->asoc = (unsigned long)asoc; + __entry->mark = skb->mark; + __entry->bind_port = ep->base.bind_addr.port; + __entry->peer_port = asoc->peer.port; + __entry->pathmtu = asoc->pathmtu; + __entry->rwnd = asoc->peer.rwnd; + __entry->unack_data = asoc->unack_data; + + if (trace_sctp_probe_path_enabled()) { + struct sctp_transport *sp; + + list_for_each_entry(sp, &asoc->peer.transport_addr_list, + transports) { + trace_sctp_probe_path(sp, asoc); + } + } + ), But I found it did not work when I did testing, and trace_sctp_probe_path had no output, I finally found that there is trace buffer lock operation(trace_event_buffer_reserve) in include/trace/trace_events.h: static notrace void \ trace_event_raw_event_##call(void *__data, proto) \ { \ struct trace_event_file *trace_file = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ struct trace_event_buffer fbuffer; \ struct trace_event_raw_##call *entry; \ int __data_size; \ \ if (trace_trigger_soft_disabled(trace_file)) \ return; \ \ __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \ \ entry = trace_event_buffer_reserve(&fbuffer, trace_file, \ sizeof(*entry) + __data_size); \ \ if (!entry) \ return; \ \ tstruct \ \ { assign; } \ \ trace_event_buffer_commit(&fbuffer); \ } The reason caused no output of trace_sctp_probe_path is that trace_sctp_probe_path written in TP_fast_assign part of TRACE_EVENT(sctp_probe), and it will be placed( { assign; } ) after the trace_event_buffer_reserve() when compiler expands Macro, entry = trace_event_buffer_reserve(&fbuffer, trace_file, \ sizeof(*entry) + __data_size); \ \ if (!entry) \ return; \ \ tstruct \ \ { assign; } \ so trace_sctp_probe_path finally can not acquire trace_event_buffer and return no output, that is to say the nest of tracepoint entry function is not allowed. The function call flow is: trace_sctp_probe() -> trace_event_raw_event_sctp_probe() -> lock buffer -> trace_sctp_probe_path() -> trace_event_raw_event_sctp_probe_path() --nested -> buffer has been locked and return no output. This patch is to remove trace_sctp_probe_path from the TP_fast_assign part of TRACE_EVENT(sctp_probe) to avoid the nest of entry function, and trigger sctp_probe_path_trace in sctp_outq_sack. After this patch, you can enable both events individually, # cd /sys/kernel/debug/tracing # echo 1 > events/sctp/sctp_probe/enable # echo 1 > events/sctp/sctp_probe_path/enable Or, you can enable all the events under sctp. # echo 1 > events/sctp/enable Signed-off-by: Kevin Kou Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/trace/events/sctp.h | 9 --------- net/sctp/outqueue.c | 6 ++++++ 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/trace/events/sctp.h b/include/trace/events/sctp.h index 7475c7be165a..d4aac3436595 100644 --- a/include/trace/events/sctp.h +++ b/include/trace/events/sctp.h @@ -75,15 +75,6 @@ TRACE_EVENT(sctp_probe, __entry->pathmtu = asoc->pathmtu; __entry->rwnd = asoc->peer.rwnd; __entry->unack_data = asoc->unack_data; - - if (trace_sctp_probe_path_enabled()) { - struct sctp_transport *sp; - - list_for_each_entry(sp, &asoc->peer.transport_addr_list, - transports) { - trace_sctp_probe_path(sp, asoc); - } - } ), TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d " diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index a031d1134c60..6b0b3bad4daa 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -36,6 +36,7 @@ #include #include #include +#include /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -1238,6 +1239,11 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; + /* SCTP path tracepoint for congestion control debugging. */ + list_for_each_entry(transport, transport_list, transports) { + trace_sctp_probe_path(transport, asoc); + } + sack_ctsn = ntohl(sack->cum_tsn_ack); gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); asoc->stats.gapcnt += gap_ack_blocks; -- cgit v1.2.3 From 0914d2bb11cc182039084ac3b961dc359b647468 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Mon, 23 Dec 2019 18:42:57 +0800 Subject: af_packet: refactoring code for prb_calc_retire_blk_tmo If __ethtool_get_link_ksettings() is failed and with non-zero value, prb_calc_retire_blk_tmo() should return DEFAULT_PRB_RETIRE_TOV firstly. This patch is to refactory code and make it more readable. Signed-off-by: Mao Wenan Signed-off-by: David S. Miller --- net/packet/af_packet.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 118cd66b7516..3bec515ccde3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -520,7 +520,7 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; - unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; + unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; @@ -532,31 +532,25 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); - if (!err) { - /* - * If the link speed is so slow you don't really - * need to worry about perf anyways - */ - if (ecmd.base.speed < SPEED_1000 || - ecmd.base.speed == SPEED_UNKNOWN) { - return DEFAULT_PRB_RETIRE_TOV; - } else { - msec = 1; - div = ecmd.base.speed / 1000; - } - } else + if (err) return DEFAULT_PRB_RETIRE_TOV; + /* If the link speed is so slow you don't really + * need to worry about perf anyways + */ + if (ecmd.base.speed < SPEED_1000 || + ecmd.base.speed == SPEED_UNKNOWN) + return DEFAULT_PRB_RETIRE_TOV; + + div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; - tmo = mbits * msec; - if (div) - return tmo+1; - return tmo; + return mbits + 1; + return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, -- cgit v1.2.3 From 473900a504e510cf9175876de8892ad1e3e7efab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Dec 2019 12:27:50 -0800 Subject: tcp_cubic: optimize hystart_update() We do not care which bit in ca->found is set. We avoid accessing hystart and hystart_detect unless really needed, possibly avoiding one cache line miss. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 1b3d032a4df2..297936033c33 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -381,9 +381,6 @@ static void hystart_update(struct sock *sk, u32 delay) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (ca->found & hystart_detect) - return; - if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); @@ -391,7 +388,7 @@ static void hystart_update(struct sock *sk, u32 delay) if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { ca->last_ack = now; if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { - ca->found |= HYSTART_ACK_TRAIN; + ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), @@ -412,7 +409,7 @@ static void hystart_update(struct sock *sk, u32 delay) } else { if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { - ca->found |= HYSTART_DELAY; + ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), @@ -450,7 +447,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tcp_in_slow_start(tp) && + if (!ca->found && hystart && tcp_in_slow_start(tp) && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } -- cgit v1.2.3 From 35821fc2b41c51161e503bade3630603668dd3af Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Dec 2019 12:27:51 -0800 Subject: tcp_cubic: remove one conditional from hystart_update() If we initialize ca->curr_rtt to ~0U, we do not need to test for zero value in hystart_update() We only read ca->curr_rtt if at least HYSTART_MIN_SAMPLES have been processed, and thus ca->curr_rtt will have a sane value. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 297936033c33..5eff762acd7f 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -133,7 +133,7 @@ static inline void bictcp_hystart_reset(struct sock *sk) ca->round_start = ca->last_ack = bictcp_clock(); ca->end_seq = tp->snd_nxt; - ca->curr_rtt = 0; + ca->curr_rtt = ~0U; ca->sample_cnt = 0; } @@ -402,7 +402,7 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { - if (ca->curr_rtt == 0 || ca->curr_rtt > delay) + if (ca->curr_rtt > delay) ca->curr_rtt = delay; ca->sample_cnt++; -- cgit v1.2.3 From cff04e2da308c522f654237b45dd64248fe8d1fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Dec 2019 12:27:52 -0800 Subject: tcp_cubic: switch bictcp_clock() to usec resolution Current 1ms clock feeds ca->round_start, ca->delay_min, ca->last_ack. This is quite problematic for data-center flows, where delay_min is way below 1 ms. This means Hystart Train detection triggers every time jiffies value is updated, since "((s32)(now - ca->round_start) > ca->delay_min >> 4)" expression becomes true. This kind of random behavior can be solved by reusing the existing usec timestamp that TCP keeps in tp->tcp_mstamp Note that a followup patch will tweak things a bit, because during slow start, GRO aggregation on receivers naturally increases the RTT as TSO packets gradually come to ~64KB size. To recap, right after this patch CUBIC Hystart train detection is more aggressive, since short RTT flows might exit slow start at cwnd = 20, instead of being possibly unbounded. Following patch will address this problem. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 5eff762acd7f..068775b91fb5 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -40,8 +40,8 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (4U<<3) -#define HYSTART_DELAY_MAX (16U<<3) +#define HYSTART_DELAY_MIN (4000U) /* 4 ms */ +#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) static int fast_convergence __read_mostly = 1; @@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; -static int hystart_ack_delta __read_mostly = 2; +static int hystart_ack_delta_us __read_mostly = 2000; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); -module_param(hystart_ack_delta, int, 0644); -MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); +module_param(hystart_ack_delta_us, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -89,7 +89,7 @@ struct bictcp { u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay (msec << 3) */ + u32 delay_min; /* min delay (usec) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } -static inline u32 bictcp_clock(void) +static inline u32 bictcp_clock_us(const struct sock *sk) { -#if HZ < 1000 - return ktime_to_ms(ktime_get_real()); -#else - return jiffies_to_msecs(jiffies); -#endif + return tcp_sk(sk)->tcp_mstamp; } static inline void bictcp_hystart_reset(struct sock *sk) @@ -131,7 +127,7 @@ static inline void bictcp_hystart_reset(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_ack = bictcp_clock(); + ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; ca->curr_rtt = ~0U; ca->sample_cnt = 0; @@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) */ t = (s32)(tcp_jiffies32 - ca->epoch_start); - t += msecs_to_jiffies(ca->delay_min >> 3); + t += usecs_to_jiffies(ca->delay_min); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; do_div(t, HZ); @@ -382,12 +378,12 @@ static void hystart_update(struct sock *sk, u32 delay) struct bictcp *ca = inet_csk_ca(sk); if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now = bictcp_clock(); + u32 now = bictcp_clock_us(sk); /* first detection parameter - ack-train detection */ - if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { + if ((s32)(now - ca->round_start) > ca->delay_min >> 1) { ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); @@ -421,9 +417,6 @@ static void hystart_update(struct sock *sk, u32 delay) } } -/* Track delayed acknowledgment ratio using sliding window - * ratio = (15*ratio + sample) / 16 - */ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); @@ -438,7 +431,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; - delay = (sample->rtt_us << 3) / USEC_PER_MSEC; + delay = sample->rtt_us; if (delay == 0) delay = 1; -- cgit v1.2.3 From 42f3a8aaae66d31d87850fb4b02979a0fc5dc541 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Dec 2019 12:27:53 -0800 Subject: tcp_cubic: tweak Hystart detection for short RTT flows After switching ca->delay_min to usec resolution, we exit slow start prematurely for very low RTT flows, setting snd_ssthresh to 20. The reason is that delay_min is fed with RTT of small packet trains. Then as cwnd is increased, TCP sends bigger TSO packets. LRO/GRO aggregation and/or interrupt mitigation strategies on receiver tend to inflate RTT samples. Fix this by adding to delay_min the expected delay of two TSO packets, given current pacing rate. Tested: Sender uses pfifo_fast qdisc Before : $ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart" 11348 11707 11562 11428 11773 11534 9878 11693 10597 10968 TcpExtTCPHystartTrainDetect 10 0.0 TcpExtTCPHystartTrainCwnd 200 0.0 After : $ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart" 14877 14517 15797 18466 17376 14833 17558 17933 16039 18059 TcpExtTCPHystartTrainDetect 10 0.0 TcpExtTCPHystartTrainCwnd 1670 0.0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 068775b91fb5..0e5428ed04fe 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -436,8 +436,27 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) delay = 1; /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) - ca->delay_min = delay; + if (ca->delay_min == 0 || ca->delay_min > delay) { + unsigned long rate = READ_ONCE(sk->sk_pacing_rate); + + /* Account for TSO/GRO delays. + * Otherwise short RTT flows could get too small ssthresh, + * since during slow start we begin with small TSO packets + * and could lower ca->delay_min too much. + * Ideally even with a very small RTT we would like to have + * at least one TSO packet being sent and received by GRO, + * and another one in qdisc layer. + * We apply another 100% factor because @rate is doubled at + * this point. + * We cap the cushion to 1ms. + */ + if (rate) + delay += min_t(u64, USEC_PER_MSEC, + div64_ul((u64)GSO_MAX_SIZE * + 4 * USEC_PER_SEC, rate)); + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; + } /* hystart triggers when cwnd is larger than some threshold */ if (!ca->found && hystart && tcp_in_slow_start(tp) && -- cgit v1.2.3 From ede656e8465839530c3287c7f54adf75dc2b9563 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Dec 2019 12:27:54 -0800 Subject: tcp_cubic: make Hystart aware of pacing For years we disabled Hystart ACK train detection at Google because it was fooled by TCP pacing. ACK train detection uses a simple heuristic, detecting if we receive ACK past half the RTT, to exit slow start before hitting the bottleneck and experience massive drops. But pacing by design might delay packets up to RTT/2, so we need to tweak the Hystart logic to be aware of this extra delay. Tested: Added a 100 usec delay at receiver. Before: nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart" 9117 7057 9553 8300 7030 6849 9533 10126 6876 8473 TcpExtTCPHystartTrainDetect 10 0.0 TcpExtTCPHystartTrainCwnd 1230 0.0 After : nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart" 9845 10103 10866 11096 11936 11487 11773 12188 11066 11894 TcpExtTCPHystartTrainDetect 10 0.0 TcpExtTCPHystartTrainCwnd 6462 0.0 Disabling Hystart ACK Train detection gives similar numbers echo 2 >/sys/module/tcp_cubic/parameters/hystart_detect nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart" 11173 10954 12455 10627 11578 11583 11222 10880 10665 11366 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 0e5428ed04fe..d02bb283c689 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -376,6 +376,7 @@ static void hystart_update(struct sock *sk, u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); + u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock_us(sk); @@ -383,7 +384,17 @@ static void hystart_update(struct sock *sk, u32 delay) /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 1) { + + threshold = ca->delay_min; + /* Hystart ack train triggers if we get ack past + * ca->delay_min/2. + * Pacing might have delayed packets up to RTT/2 + * during slow start. + */ + if (sk->sk_pacing_status == SK_PACING_NONE) + threshold >>= 1; + + if ((s32)(now - ca->round_start) > threshold) { ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); -- cgit v1.2.3 From 356b23c073dd063427102329b296061855b912d9 Mon Sep 17 00:00:00 2001 From: Kevin Kou Date: Wed, 25 Dec 2019 08:27:25 +0000 Subject: sctp: do trace_sctp_probe after SACK validation and check The function sctp_sf_eat_sack_6_2 now performs the Verification Tag validation, Chunk length validation, Bogu check, and also the detection of out-of-order SACK based on the RFC2960 Section 6.2 at the beginning, and finally performs the further processing of SACK. The trace_sctp_probe now triggered before the above necessary validation and check. this patch is to do the trace_sctp_probe after the chunk sanity tests, but keep doing trace if the SACK received is out of order, for the out-of-order SACK is valuable to congestion control debugging. v1->v2: - keep doing SCTP trace if the SACK is out of order as Marcelo's suggestion. v2->v3: - regenerate the patch as v2 generated on top of v1, and add 'net-next' tag to the new one as Marcelo's comments. Signed-off-by: Kevin Kou Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 42558fa3f3e4..748e3b19ec1d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3281,8 +3281,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, struct sctp_sackhdr *sackh; __u32 ctsn; - trace_sctp_probe(ep, asoc, chunk); - if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -3299,6 +3297,15 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, chunk->subh.sack_hdr = sackh; ctsn = ntohl(sackh->cum_tsn_ack); + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (TSN_lte(asoc->next_tsn, ctsn)) + return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands); + + trace_sctp_probe(ep, asoc, chunk); + /* i) If Cumulative TSN Ack is less than the Cumulative TSN * Ack Point, then drop the SACK. Since Cumulative TSN * Ack is monotonically increasing, a SACK whose @@ -3312,13 +3319,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, return SCTP_DISPOSITION_DISCARD; } - /* If Cumulative TSN Ack beyond the max tsn currently - * send, terminating the association and respond to the - * sender with an ABORT. - */ - if (!TSN_lt(ctsn, asoc->next_tsn)) - return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands); - /* Return this SACK for further processing. */ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk)); -- cgit v1.2.3 From 2b4a8990b7df55875745a80a609a1ceaaf51f322 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:18 +0100 Subject: ethtool: introduce ethtool netlink interface Basic genetlink and init infrastructure for the netlink interface, register genetlink family "ethtool". Add CONFIG_ETHTOOL_NETLINK Kconfig option to make the build optional. Add initial overall interface description into Documentation/networking/ethtool-netlink.rst, further patches will add more detailed information. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 216 +++++++++++++++++++++++++++ Documentation/networking/index.rst | 1 + include/linux/ethtool_netlink.h | 9 ++ include/uapi/linux/ethtool_netlink.h | 36 +++++ net/Kconfig | 8 + net/ethtool/Makefile | 6 +- net/ethtool/netlink.c | 33 ++++ net/ethtool/netlink.h | 10 ++ 8 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 Documentation/networking/ethtool-netlink.rst create mode 100644 include/linux/ethtool_netlink.h create mode 100644 include/uapi/linux/ethtool_netlink.h create mode 100644 net/ethtool/netlink.c create mode 100644 net/ethtool/netlink.h (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst new file mode 100644 index 000000000000..9448442ad293 --- /dev/null +++ b/Documentation/networking/ethtool-netlink.rst @@ -0,0 +1,216 @@ +============================= +Netlink interface for ethtool +============================= + + +Basic information +================= + +Netlink interface for ethtool uses generic netlink family ``ethtool`` +(userspace application should use macros ``ETHTOOL_GENL_NAME`` and +``ETHTOOL_GENL_VERSION`` defined in ```` uapi +header). This family does not use a specific header, all information in +requests and replies is passed using netlink attributes. + +The ethtool netlink interface uses extended ACK for error and warning +reporting, userspace application developers are encouraged to make these +messages available to user in a suitable way. + +Requests can be divided into three categories: "get" (retrieving information), +"set" (setting parameters) and "action" (invoking an action). + +All "set" and "action" type requests require admin privileges +(``CAP_NET_ADMIN`` in the namespace). Most "get" type requests are allowed for +anyone but there are exceptions (where the response contains sensitive +information). In some cases, the request as such is allowed for anyone but +unprivileged users have attributes with sensitive information (e.g. +wake-on-lan password) omitted. + + +Conventions +=========== + +Attributes which represent a boolean value usually use NLA_U8 type so that we +can distinguish three states: "on", "off" and "not present" (meaning the +information is not available in "get" requests or value is not to be changed +in "set" requests). For these attributes, the "true" value should be passed as +number 1 but any non-zero value should be understood as "true" by recipient. +In the tables below, "bool" denotes NLA_U8 attributes interpreted in this way. + +In the message structure descriptions below, if an attribute name is suffixed +with "+", parent nest can contain multiple attributes of the same type. This +implements an array of entries. + + +Request header +============== + +Each request or reply message contains a nested attribute with common header. +Structure of this header is + + ============================== ====== ============================= + ``ETHTOOL_A_HEADER_DEV_INDEX`` u32 device ifindex + ``ETHTOOL_A_HEADER_DEV_NAME`` string device name + ``ETHTOOL_A_HEADER_FLAGS`` u32 flags common for all requests + ============================== ====== ============================= + +``ETHTOOL_A_HEADER_DEV_INDEX`` and ``ETHTOOL_A_HEADER_DEV_NAME`` identify the +device message relates to. One of them is sufficient in requests, if both are +used, they must identify the same device. Some requests, e.g. global string +sets, do not require device identification. Most ``GET`` requests also allow +dump requests without device identification to query the same information for +all devices providing it (each device in a separate message). + +``ETHTOOL_A_HEADER_FLAGS`` is a bitmap of request flags common for all request +types. The interpretation of these flags is the same for all request types but +the flags may not apply to requests. Recognized flags are: + + ================================= =================================== + ``ETHTOOL_FLAG_COMPACT_BITSETS`` use compact format bitsets in reply + ``ETHTOOL_FLAG_OMIT_REPLY`` omit optional reply (_SET and _ACT) + ================================= =================================== + +New request flags should follow the general idea that if the flag is not set, +the behaviour is backward compatible, i.e. requests from old clients not aware +of the flag should be interpreted the way the client expects. A client must +not set flags it does not understand. + + +List of message types +===================== + +All constants identifying message types use ``ETHTOOL_CMD_`` prefix and suffix +according to message purpose: + + ============== ====================================== + ``_GET`` userspace request to retrieve data + ``_SET`` userspace request to set data + ``_ACT`` userspace request to perform an action + ``_GET_REPLY`` kernel reply to a ``GET`` request + ``_SET_REPLY`` kernel reply to a ``SET`` request + ``_ACT_REPLY`` kernel reply to an ``ACT`` request + ``_NTF`` kernel notification + ============== ====================================== + +``GET`` requests are sent by userspace applications to retrieve device +information. They usually do not contain any message specific attributes. +Kernel replies with corresponding "GET_REPLY" message. For most types, ``GET`` +request with ``NLM_F_DUMP`` and no device identification can be used to query +the information for all devices supporting the request. + +If the data can be also modified, corresponding ``SET`` message with the same +layout as corresponding ``GET_REPLY`` is used to request changes. Only +attributes where a change is requested are included in such request (also, not +all attributes may be changed). Replies to most ``SET`` request consist only +of error code and extack; if kernel provides additional data, it is sent in +the form of corresponding ``SET_REPLY`` message which can be suppressed by +setting ``ETHTOOL_FLAG_OMIT_REPLY`` flag in request header. + +Data modification also triggers sending a ``NTF`` message with a notification. +These usually bear only a subset of attributes which was affected by the +change. The same notification is issued if the data is modified using other +means (mostly ioctl ethtool interface). Unlike notifications from ethtool +netlink code which are only sent if something actually changed, notifications +triggered by ioctl interface may be sent even if the request did not actually +change any data. + +``ACT`` messages request kernel (driver) to perform a specific action. If some +information is reported by kernel (which can be suppressed by setting +``ETHTOOL_FLAG_OMIT_REPLY`` flag in request header), the reply takes form of +an ``ACT_REPLY`` message. Performing an action also triggers a notification +(``NTF`` message). + +Later sections describe the format and semantics of these messages. + + +Request translation +=================== + +The following table maps ioctl commands to netlink commands providing their +functionality. Entries with "n/a" in right column are commands which do not +have their netlink replacement yet. + + =================================== ===================================== + ioctl command netlink command + =================================== ===================================== + ``ETHTOOL_GSET`` n/a + ``ETHTOOL_SSET`` n/a + ``ETHTOOL_GDRVINFO`` n/a + ``ETHTOOL_GREGS`` n/a + ``ETHTOOL_GWOL`` n/a + ``ETHTOOL_SWOL`` n/a + ``ETHTOOL_GMSGLVL`` n/a + ``ETHTOOL_SMSGLVL`` n/a + ``ETHTOOL_NWAY_RST`` n/a + ``ETHTOOL_GLINK`` n/a + ``ETHTOOL_GEEPROM`` n/a + ``ETHTOOL_SEEPROM`` n/a + ``ETHTOOL_GCOALESCE`` n/a + ``ETHTOOL_SCOALESCE`` n/a + ``ETHTOOL_GRINGPARAM`` n/a + ``ETHTOOL_SRINGPARAM`` n/a + ``ETHTOOL_GPAUSEPARAM`` n/a + ``ETHTOOL_SPAUSEPARAM`` n/a + ``ETHTOOL_GRXCSUM`` n/a + ``ETHTOOL_SRXCSUM`` n/a + ``ETHTOOL_GTXCSUM`` n/a + ``ETHTOOL_STXCSUM`` n/a + ``ETHTOOL_GSG`` n/a + ``ETHTOOL_SSG`` n/a + ``ETHTOOL_TEST`` n/a + ``ETHTOOL_GSTRINGS`` n/a + ``ETHTOOL_PHYS_ID`` n/a + ``ETHTOOL_GSTATS`` n/a + ``ETHTOOL_GTSO`` n/a + ``ETHTOOL_STSO`` n/a + ``ETHTOOL_GPERMADDR`` rtnetlink ``RTM_GETLINK`` + ``ETHTOOL_GUFO`` n/a + ``ETHTOOL_SUFO`` n/a + ``ETHTOOL_GGSO`` n/a + ``ETHTOOL_SGSO`` n/a + ``ETHTOOL_GFLAGS`` n/a + ``ETHTOOL_SFLAGS`` n/a + ``ETHTOOL_GPFLAGS`` n/a + ``ETHTOOL_SPFLAGS`` n/a + ``ETHTOOL_GRXFH`` n/a + ``ETHTOOL_SRXFH`` n/a + ``ETHTOOL_GGRO`` n/a + ``ETHTOOL_SGRO`` n/a + ``ETHTOOL_GRXRINGS`` n/a + ``ETHTOOL_GRXCLSRLCNT`` n/a + ``ETHTOOL_GRXCLSRULE`` n/a + ``ETHTOOL_GRXCLSRLALL`` n/a + ``ETHTOOL_SRXCLSRLDEL`` n/a + ``ETHTOOL_SRXCLSRLINS`` n/a + ``ETHTOOL_FLASHDEV`` n/a + ``ETHTOOL_RESET`` n/a + ``ETHTOOL_SRXNTUPLE`` n/a + ``ETHTOOL_GRXNTUPLE`` n/a + ``ETHTOOL_GSSET_INFO`` n/a + ``ETHTOOL_GRXFHINDIR`` n/a + ``ETHTOOL_SRXFHINDIR`` n/a + ``ETHTOOL_GFEATURES`` n/a + ``ETHTOOL_SFEATURES`` n/a + ``ETHTOOL_GCHANNELS`` n/a + ``ETHTOOL_SCHANNELS`` n/a + ``ETHTOOL_SET_DUMP`` n/a + ``ETHTOOL_GET_DUMP_FLAG`` n/a + ``ETHTOOL_GET_DUMP_DATA`` n/a + ``ETHTOOL_GET_TS_INFO`` n/a + ``ETHTOOL_GMODULEINFO`` n/a + ``ETHTOOL_GMODULEEEPROM`` n/a + ``ETHTOOL_GEEE`` n/a + ``ETHTOOL_SEEE`` n/a + ``ETHTOOL_GRSSH`` n/a + ``ETHTOOL_SRSSH`` n/a + ``ETHTOOL_GTUNABLE`` n/a + ``ETHTOOL_STUNABLE`` n/a + ``ETHTOOL_GPHYSTATS`` n/a + ``ETHTOOL_PERQUEUE`` n/a + ``ETHTOOL_GLINKSETTINGS`` n/a + ``ETHTOOL_SLINKSETTINGS`` n/a + ``ETHTOOL_PHY_GTUNABLE`` n/a + ``ETHTOOL_PHY_STUNABLE`` n/a + ``ETHTOOL_GFECPARAM`` n/a + ``ETHTOOL_SFECPARAM`` n/a + =================================== ===================================== diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5acab1290e03..bee73be7af93 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -16,6 +16,7 @@ Contents: devlink-info-versions devlink-trap devlink-trap-netdevsim + ethtool-netlink ieee802154 j1939 kapi diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h new file mode 100644 index 000000000000..f27e92b5f344 --- /dev/null +++ b/include/linux/ethtool_netlink.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _LINUX_ETHTOOL_NETLINK_H_ +#define _LINUX_ETHTOOL_NETLINK_H_ + +#include +#include + +#endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h new file mode 100644 index 000000000000..3c93276ba066 --- /dev/null +++ b/include/uapi/linux/ethtool_netlink.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * include/uapi/linux/ethtool_netlink.h - netlink interface for ethtool + * + * See Documentation/networking/ethtool-netlink.txt in kernel source tree for + * doucumentation of the interface. + */ + +#ifndef _UAPI_LINUX_ETHTOOL_NETLINK_H_ +#define _UAPI_LINUX_ETHTOOL_NETLINK_H_ + +#include + +/* message types - userspace to kernel */ +enum { + ETHTOOL_MSG_USER_NONE, + + /* add new constants above here */ + __ETHTOOL_MSG_USER_CNT, + ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 +}; + +/* message types - kernel to userspace */ +enum { + ETHTOOL_MSG_KERNEL_NONE, + + /* add new constants above here */ + __ETHTOOL_MSG_KERNEL_CNT, + ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 +}; + +/* generic netlink info */ +#define ETHTOOL_GENL_NAME "ethtool" +#define ETHTOOL_GENL_VERSION 1 + +#endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/net/Kconfig b/net/Kconfig index 52af65e5d28c..54916b7adb9b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -449,6 +449,14 @@ config FAILOVER migration of VMs with direct attached VFs by failing over to the paravirtual datapath when the VF is unplugged. +config ETHTOOL_NETLINK + bool "Netlink interface for ethtool" + default y + help + An alternative userspace interface for ethtool based on generic + netlink. It provides better extensibility and some new features, + e.g. notification messages. + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index f68387618973..59d5ee230c29 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -1,3 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += ioctl.o common.o +obj-y += ioctl.o common.o + +obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o + +ethtool_nl-y := netlink.o diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c new file mode 100644 index 000000000000..59e1ebde2f15 --- /dev/null +++ b/net/ethtool/netlink.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include "netlink.h" + +/* genetlink setup */ + +static const struct genl_ops ethtool_genl_ops[] = { +}; + +static struct genl_family ethtool_genl_family = { + .name = ETHTOOL_GENL_NAME, + .version = ETHTOOL_GENL_VERSION, + .netnsok = true, + .parallel_ops = true, + .ops = ethtool_genl_ops, + .n_ops = ARRAY_SIZE(ethtool_genl_ops), +}; + +/* module setup */ + +static int __init ethnl_init(void) +{ + int ret; + + ret = genl_register_family(ðtool_genl_family); + if (WARN(ret < 0, "ethtool: genetlink family registration failed")) + return ret; + + return 0; +} + +subsys_initcall(ethnl_init); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h new file mode 100644 index 000000000000..e4220780d368 --- /dev/null +++ b/net/ethtool/netlink.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_NETLINK_H +#define _NET_ETHTOOL_NETLINK_H + +#include +#include +#include + +#endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 041b1c5d4a53e97fc9e029ae32469552ca12cb9b Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:23 +0100 Subject: ethtool: helper functions for netlink interface Add common request/reply header definition and helpers to parse request header and fill reply header. Provide ethnl_update_* helpers to update structure members from request attributes (to be used for *_SET requests). Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 21 ++++ net/ethtool/netlink.c | 166 ++++++++++++++++++++++++++++ net/ethtool/netlink.h | 205 +++++++++++++++++++++++++++++++++++ 3 files changed, 392 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 3c93276ba066..82fc3b5f41c9 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -29,6 +29,27 @@ enum { ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 }; +/* request header */ + +/* use compact bitsets in reply */ +#define ETHTOOL_FLAG_COMPACT_BITSETS (1 << 0) +/* provide optional reply for SET or ACT requests */ +#define ETHTOOL_FLAG_OMIT_REPLY (1 << 1) + +#define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \ + ETHTOOL_FLAG_OMIT_REPLY) + +enum { + ETHTOOL_A_HEADER_UNSPEC, + ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ + ETHTOOL_A_HEADER_DEV_NAME, /* string */ + ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ + + /* add new constants above here */ + __ETHTOOL_A_HEADER_CNT, + ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 59e1ebde2f15..aef882e0c3f5 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -1,8 +1,174 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include "netlink.h" +static struct genl_family ethtool_genl_family; + +static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = { + [ETHTOOL_A_HEADER_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = ALTIFNAMSIZ - 1 }, + [ETHTOOL_A_HEADER_FLAGS] = { .type = NLA_U32 }, +}; + +/** + * ethnl_parse_header() - parse request header + * @req_info: structure to put results into + * @header: nest attribute with request header + * @net: request netns + * @extack: netlink extack for error reporting + * @require_dev: fail if no device identified in header + * + * Parse request header in nested attribute @nest and puts results into + * the structure pointed to by @req_info. Extack from @info is used for error + * reporting. If req_info->dev is not null on return, reference to it has + * been taken. If error is returned, *req_info is null initialized and no + * reference is held. + * + * Return: 0 on success or negative error code + */ +int ethnl_parse_header(struct ethnl_req_info *req_info, + const struct nlattr *header, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1]; + const struct nlattr *devname_attr; + struct net_device *dev = NULL; + int ret; + + if (!header) { + NL_SET_ERR_MSG(extack, "request header missing"); + return -EINVAL; + } + ret = nla_parse_nested(tb, ETHTOOL_A_HEADER_MAX, header, + ethnl_header_policy, extack); + if (ret < 0) + return ret; + devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; + + if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { + u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); + + dev = dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_HEADER_DEV_INDEX], + "no device matches ifindex"); + return -ENODEV; + } + /* if both ifindex and ifname are passed, they must match */ + if (devname_attr && + strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { + dev_put(dev); + NL_SET_ERR_MSG_ATTR(extack, header, + "ifindex and name do not match"); + return -ENODEV; + } + } else if (devname_attr) { + dev = dev_get_by_name(net, nla_data(devname_attr)); + if (!dev) { + NL_SET_ERR_MSG_ATTR(extack, devname_attr, + "no device matches name"); + return -ENODEV; + } + } else if (require_dev) { + NL_SET_ERR_MSG_ATTR(extack, header, + "neither ifindex nor name specified"); + return -EINVAL; + } + + if (dev && !netif_device_present(dev)) { + dev_put(dev); + NL_SET_ERR_MSG(extack, "device not present"); + return -ENODEV; + } + + req_info->dev = dev; + if (tb[ETHTOOL_A_HEADER_FLAGS]) + req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); + + return 0; +} + +/** + * ethnl_fill_reply_header() - Put common header into a reply message + * @skb: skb with the message + * @dev: network device to describe in header + * @attrtype: attribute type to use for the nest + * + * Create a nested attribute with attributes describing given network device. + * + * Return: 0 on success, error value (-EMSGSIZE only) on error + */ +int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, + u16 attrtype) +{ + struct nlattr *nest; + + if (!dev) + return 0; + nest = nla_nest_start(skb, attrtype); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) || + nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) + goto nla_put_failure; + /* If more attributes are put into reply header, ethnl_header_size() + * must be updated to account for them. + */ + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +/** + * ethnl_reply_init() - Create skb for a reply and fill device identification + * @payload: payload length (without netlink and genetlink header) + * @dev: device the reply is about (may be null) + * @cmd: ETHTOOL_MSG_* message type for reply + * @info: genetlink info of the received packet we respond to + * @ehdrp: place to store payload pointer returned by genlmsg_new() + * + * Return: pointer to allocated skb on success, NULL on error + */ +struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, + u16 hdr_attrtype, struct genl_info *info, + void **ehdrp) +{ + struct sk_buff *skb; + + skb = genlmsg_new(payload, GFP_KERNEL); + if (!skb) + goto err; + *ehdrp = genlmsg_put_reply(skb, info, ðtool_genl_family, 0, cmd); + if (!*ehdrp) + goto err_free; + + if (dev) { + int ret; + + ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype); + if (ret < 0) + goto err_free; + } + return skb; + +err_free: + nlmsg_free(skb); +err: + if (info) + GENL_SET_ERR_MSG(info, "failed to setup reply message"); + return NULL; +} + /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index e4220780d368..05d7183da894 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -6,5 +6,210 @@ #include #include #include +#include + +struct ethnl_req_info; + +int ethnl_parse_header(struct ethnl_req_info *req_info, + const struct nlattr *nest, struct net *net, + struct netlink_ext_ack *extack, bool require_dev); +int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, + u16 attrtype); +struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, + u16 hdr_attrtype, struct genl_info *info, + void **ehdrp); + +/** + * ethnl_strz_size() - calculate attribute length for fixed size string + * @s: ETH_GSTRING_LEN sized string (may not be null terminated) + * + * Return: total length of an attribute with null terminated string from @s + */ +static inline int ethnl_strz_size(const char *s) +{ + return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); +} + +/** + * ethnl_put_strz() - put string attribute with fixed size string + * @skb: skb with the message + * @attrype: attribute type + * @s: ETH_GSTRING_LEN sized string (may not be null terminated) + * + * Puts an attribute with null terminated string from @s into the message. + * + * Return: 0 on success, negative error code on failure + */ +static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, + const char *s) +{ + unsigned int len = strnlen(s, ETH_GSTRING_LEN); + struct nlattr *attr; + + attr = nla_reserve(skb, attrtype, len + 1); + if (!attr) + return -EMSGSIZE; + + memcpy(nla_data(attr), s, len); + ((char *)nla_data(attr))[len] = '\0'; + return 0; +} + +/** + * ethnl_update_u32() - update u32 value from NLA_U32 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Copy the u32 value from NLA_U32 netlink attribute @attr into variable + * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod + * is set to true if this function changed the value of *dst, otherwise it + * is left as is. + */ +static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + u32 val; + + if (!attr) + return; + val = nla_get_u32(attr); + if (*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_u8() - update u8 value from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Copy the u8 value from NLA_U8 netlink attribute @attr into variable + * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod + * is set to true if this function changed the value of *dst, otherwise it + * is left as is. + */ +static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = nla_get_u8(attr); + if (*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable + * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is + * null. Bool pointed to by @mod is set to true if this function changed the + * logical value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + u8 val; + + if (!attr) + return; + val = !!nla_get_u8(attr); + if (!!*dst == val) + return; + + *dst = val; + *mod = true; +} + +/** + * ethnl_update_binary() - update binary data from NLA_BINARY atribute + * @dst: value to update + * @len: destination buffer length + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block + * of length @len at @dst by attribute payload; do nothing if @attr is null. + * Bool pointed to by @mod is set to true if this function changed the logical + * value of *dst, otherwise it is left as is. + */ +static inline void ethnl_update_binary(void *dst, unsigned int len, + const struct nlattr *attr, bool *mod) +{ + if (!attr) + return; + if (nla_len(attr) < len) + len = nla_len(attr); + if (!memcmp(dst, nla_data(attr), len)) + return; + + memcpy(dst, nla_data(attr), len); + *mod = true; +} + +/** + * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute + * @dst: value to update + * @attr: netlink attribute with new value or null + * @mod: pointer to bool for modification tracking + * + * Update bits in u32 value which are set in attribute's mask to values from + * attribute's value. Do nothing if @attr is null or the value wouldn't change; + * otherwise, set bool pointed to by @mod to true. + */ +static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, + bool *mod) +{ + struct nla_bitfield32 change; + u32 newval; + + if (!attr) + return; + change = nla_get_bitfield32(attr); + newval = (*dst & ~change.selector) | (change.value & change.selector); + if (*dst == newval) + return; + + *dst = newval; + *mod = true; +} + +/** + * ethnl_reply_header_size() - total size of reply header + * + * This is an upper estimate so that we do not need to hold RTNL lock longer + * than necessary (to prevent rename between size estimate and composing the + * message). Accounts only for device ifindex and name as those are the only + * attributes ethnl_fill_reply_header() puts into the reply header. + */ +static inline unsigned int ethnl_reply_header_size(void) +{ + return nla_total_size(nla_total_size(sizeof(u32)) + + nla_total_size(IFNAMSIZ)); +} + +/** + * struct ethnl_req_info - base type of request information for GET requests + * @dev: network device the request is for (may be null) + * @flags: request flags common for all request types + * + * This is a common base, additional members may follow after this structure. + */ +struct ethnl_req_info { + struct net_device *dev; + u32 flags; +}; #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 10b518d4e6dd5390e40f7d8de0f08753c1195a7e Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:28 +0100 Subject: ethtool: netlink bitset handling The ethtool netlink code uses common framework for passing arbitrary length bit sets to allow future extensions. A bitset can be a list (only one bitmap) or can consist of value and mask pair (used e.g. when client want to modify only some bits). A bitset can use one of two formats: verbose (bit by bit) or compact. Verbose format consists of bitset size (number of bits), list flag and an array of bit nests, telling which bits are part of the list or which bits are in the mask and which of them are to be set. In requests, bits can be identified by index (position) or by name. In replies, kernel provides both index and name. Verbose format is suitable for "one shot" applications like standard ethtool command as it avoids the need to either keep bit names (e.g. link modes) in sync with kernel or having to add an extra roundtrip for string set request (e.g. for private flags). Compact format uses one (list) or two (value/mask) arrays of 32-bit words to store the bitmap(s). It is more suitable for long running applications (ethtool in monitor mode or network management daemons) which can retrieve the names once and then pass only compact bitmaps to save space. Userspace requests can use either format; ETHTOOL_FLAG_COMPACT_BITSETS flag in request header tells kernel which format to use in reply. Notifications always use compact format. As some code uses arrays of unsigned long for internal representation and some arrays of u32 (or even a single u32), two sets of parse/compose helpers are introduced. To avoid code duplication, helpers for unsigned long arrays are implemented as wrappers around helpers for u32 arrays. There are two reasons for this choice: (1) u32 arrays are more frequent in ethtool code and (2) unsigned long array can be always interpreted as an u32 array on little endian 64-bit and all 32-bit architectures while we would need special handling for odd number of u32 words in the opposite direction. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 84 +++ include/uapi/linux/ethtool_netlink.h | 35 ++ net/ethtool/Makefile | 2 +- net/ethtool/bitset.c | 735 +++++++++++++++++++++++++++ net/ethtool/bitset.h | 28 + 5 files changed, 883 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/bitset.c create mode 100644 net/ethtool/bitset.h (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 9448442ad293..7797f1237472 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -76,6 +76,90 @@ of the flag should be interpreted the way the client expects. A client must not set flags it does not understand. +Bit sets +======== + +For short bitmaps of (reasonably) fixed length, standard ``NLA_BITFIELD32`` +type is used. For arbitrary length bitmaps, ethtool netlink uses a nested +attribute with contents of one of two forms: compact (two binary bitmaps +representing bit values and mask of affected bits) and bit-by-bit (list of +bits identified by either index or name). + +Verbose (bit-by-bit) bitsets allow sending symbolic names for bits together +with their values which saves a round trip (when the bitset is passed in a +request) or at least a second request (when the bitset is in a reply). This is +useful for one shot applications like traditional ethtool command. On the +other hand, long running applications like ethtool monitor (displaying +notifications) or network management daemons may prefer fetching the names +only once and using compact form to save message size. Notifications from +ethtool netlink interface always use compact form for bitsets. + +A bitset can represent either a value/mask pair (``ETHTOOL_A_BITSET_NOMASK`` +not set) or a single bitmap (``ETHTOOL_A_BITSET_NOMASK`` set). In requests +modifying a bitmap, the former changes the bit set in mask to values set in +value and preserves the rest; the latter sets the bits set in the bitmap and +clears the rest. + +Compact form: nested (bitset) atrribute contents: + + ============================ ====== ============================ + ``ETHTOOL_A_BITSET_NOMASK`` flag no mask, only a list + ``ETHTOOL_A_BITSET_SIZE`` u32 number of significant bits + ``ETHTOOL_A_BITSET_VALUE`` binary bitmap of bit values + ``ETHTOOL_A_BITSET_MASK`` binary bitmap of valid bits + ============================ ====== ============================ + +Value and mask must have length at least ``ETHTOOL_A_BITSET_SIZE`` bits +rounded up to a multiple of 32 bits. They consist of 32-bit words in host byte +order, words ordered from least significant to most significant (i.e. the same +way as bitmaps are passed with ioctl interface). + +For compact form, ``ETHTOOL_A_BITSET_SIZE`` and ``ETHTOOL_A_BITSET_VALUE`` are +mandatory. ``ETHTOOL_A_BITSET_MASK`` attribute is mandatory if +``ETHTOOL_A_BITSET_NOMASK`` is not set (bitset represents a value/mask pair); +if ``ETHTOOL_A_BITSET_NOMASK`` is not set, ``ETHTOOL_A_BITSET_MASK`` is not +allowed (bitset represents a single bitmap. + +Kernel bit set length may differ from userspace length if older application is +used on newer kernel or vice versa. If userspace bitmap is longer, an error is +issued only if the request actually tries to set values of some bits not +recognized by kernel. + +Bit-by-bit form: nested (bitset) attribute contents: + + +------------------------------------+--------+-----------------------------+ + | ``ETHTOOL_A_BITSET_NOMASK`` | flag | no mask, only a list | + +------------------------------------+--------+-----------------------------+ + | ``ETHTOOL_A_BITSET_SIZE`` | u32 | number of significant bits | + +------------------------------------+--------+-----------------------------+ + | ``ETHTOOL_A_BITSET_BITS`` | nested | array of bits | + +-+----------------------------------+--------+-----------------------------+ + | | ``ETHTOOL_A_BITSET_BITS_BIT+`` | nested | one bit | + +-+-+--------------------------------+--------+-----------------------------+ + | | | ``ETHTOOL_A_BITSET_BIT_INDEX`` | u32 | bit index (0 for LSB) | + +-+-+--------------------------------+--------+-----------------------------+ + | | | ``ETHTOOL_A_BITSET_BIT_NAME`` | string | bit name | + +-+-+--------------------------------+--------+-----------------------------+ + | | | ``ETHTOOL_A_BITSET_BIT_VALUE`` | flag | present if bit is set | + +-+-+--------------------------------+--------+-----------------------------+ + +Bit size is optional for bit-by-bit form. ``ETHTOOL_A_BITSET_BITS`` nest can +only contain ``ETHTOOL_A_BITSET_BITS_BIT`` attributes but there can be an +arbitrary number of them. A bit may be identified by its index or by its +name. When used in requests, listed bits are set to 0 or 1 according to +``ETHTOOL_A_BITSET_BIT_VALUE``, the rest is preserved. A request fails if +index exceeds kernel bit length or if name is not recognized. + +When ``ETHTOOL_A_BITSET_NOMASK`` flag is present, bitset is interpreted as +a simple bitmap. ``ETHTOOL_A_BITSET_BIT_VALUE`` attributes are not used in +such case. Such bitset represents a bitmap with listed bits set and the rest +zero. + +In requests, application can use either form. Form used by kernel in reply is +determined by ``ETHTOOL_FLAG_COMPACT_BITSETS`` flag in flags field of request +header. Semantics of value and mask depends on the attribute. + + List of message types ===================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 82fc3b5f41c9..951203049615 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -50,6 +50,41 @@ enum { ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 }; +/* bit sets */ + +enum { + ETHTOOL_A_BITSET_BIT_UNSPEC, + ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ + ETHTOOL_A_BITSET_BIT_NAME, /* string */ + ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_BIT_CNT, + ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 +}; + +enum { + ETHTOOL_A_BITSET_BITS_UNSPEC, + ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_BITS_CNT, + ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 +}; + +enum { + ETHTOOL_A_BITSET_UNSPEC, + ETHTOOL_A_BITSET_NOMASK, /* flag */ + ETHTOOL_A_BITSET_SIZE, /* u32 */ + ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ + ETHTOOL_A_BITSET_VALUE, /* binary */ + ETHTOOL_A_BITSET_MASK, /* binary */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_CNT, + ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 59d5ee230c29..a7e6c2c85db9 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -4,4 +4,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o -ethtool_nl-y := netlink.o +ethtool_nl-y := netlink.o bitset.o diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c new file mode 100644 index 000000000000..fce45dac4205 --- /dev/null +++ b/net/ethtool/bitset.c @@ -0,0 +1,735 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include "netlink.h" +#include "bitset.h" + +/* Some bitmaps are internally represented as an array of unsigned long, some + * as an array of u32 (some even as single u32 for now). To avoid the need of + * wrappers on caller side, we provide two set of functions: those with "32" + * suffix in their names expect u32 based bitmaps, those without it expect + * unsigned long bitmaps. + */ + +static u32 ethnl_lower_bits(unsigned int n) +{ + return ~(u32)0 >> (32 - n % 32); +} + +static u32 ethnl_upper_bits(unsigned int n) +{ + return ~(u32)0 << (n % 32); +} + +/** + * ethnl_bitmap32_clear() - Clear u32 based bitmap + * @dst: bitmap to clear + * @start: beginning of the interval + * @end: end of the interval + * @mod: set if bitmap was modified + * + * Clear @nbits bits of a bitmap with indices @start <= i < @end + */ +static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, + bool *mod) +{ + unsigned int start_word = start / 32; + unsigned int end_word = end / 32; + unsigned int i; + u32 mask; + + if (end <= start) + return; + + if (start % 32) { + mask = ethnl_upper_bits(start); + if (end_word == start_word) { + mask &= ethnl_lower_bits(end); + if (dst[start_word] & mask) { + dst[start_word] &= ~mask; + *mod = true; + } + return; + } + if (dst[start_word] & mask) { + dst[start_word] &= ~mask; + *mod = true; + } + start_word++; + } + + for (i = start_word; i < end_word; i++) { + if (dst[i]) { + dst[i] = 0; + *mod = true; + } + } + if (end % 32) { + mask = ethnl_lower_bits(end); + if (dst[end_word] & mask) { + dst[end_word] &= ~mask; + *mod = true; + } + } +} + +/** + * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval + * @map: bitmap to test + * @start: beginning of the interval + * @end: end of the interval + * + * Return: true if there is non-zero bit with index @start <= i < @end, + * false if the whole interval is zero + */ +static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, + unsigned int end) +{ + unsigned int start_word = start / 32; + unsigned int end_word = end / 32; + u32 mask; + + if (end <= start) + return true; + + if (start % 32) { + mask = ethnl_upper_bits(start); + if (end_word == start_word) { + mask &= ethnl_lower_bits(end); + return map[start_word] & mask; + } + if (map[start_word] & mask) + return true; + start_word++; + } + + if (!memchr_inv(map + start_word, '\0', + (end_word - start_word) * sizeof(u32))) + return true; + if (end % 32 == 0) + return true; + return map[end_word] & ethnl_lower_bits(end); +} + +/** + * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask + * pair + * @dst: bitmap to update + * @nbits: bit size of the bitmap + * @value: values to set + * @mask: mask of bits to set + * @mod: set to true if bitmap is modified, preserve if not + * + * Set bits in @dst bitmap which are set in @mask to values from @value, leave + * the rest untouched. If destination bitmap was modified, set @mod to true, + * leave as it is if not. + */ +static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, + const u32 *value, const u32 *mask, bool *mod) +{ + while (nbits > 0) { + u32 real_mask = mask ? *mask : ~(u32)0; + u32 new_value; + + if (nbits < 32) + real_mask &= ethnl_lower_bits(nbits); + new_value = (*dst & ~real_mask) | (*value & real_mask); + if (new_value != *dst) { + *dst = new_value; + *mod = true; + } + + if (nbits <= 32) + break; + dst++; + nbits -= 32; + value++; + if (mask) + mask++; + } +} + +static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) +{ + return map[index / 32] & (1U << (index % 32)); +} + +/** + * ethnl_bitset32_size() - Calculate size of bitset nested attribute + * @val: value bitmap (u32 based) + * @mask: mask bitmap (u32 based, optional) + * @nbits: bit length of the bitset + * @names: array of bit names (optional) + * @compact: assume compact format for output + * + * Estimate length of netlink attribute composed by a later call to + * ethnl_put_bitset32() call with the same arguments. + * + * Return: negative error code or attribute length estimate + */ +int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact) +{ + unsigned int len = 0; + + /* list flag */ + if (!mask) + len += nla_total_size(sizeof(u32)); + /* size */ + len += nla_total_size(sizeof(u32)); + + if (compact) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + /* value, mask */ + len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); + } else { + unsigned int bits_len = 0; + unsigned int bit_len, i; + + for (i = 0; i < nbits; i++) { + const char *name = names ? names[i] : NULL; + + if (!ethnl_bitmap32_test_bit(mask ?: val, i)) + continue; + /* index */ + bit_len = nla_total_size(sizeof(u32)); + /* name */ + if (name) + bit_len += ethnl_strz_size(name); + /* value */ + if (mask && ethnl_bitmap32_test_bit(val, i)) + bit_len += nla_total_size(0); + + /* bit nest */ + bits_len += nla_total_size(bit_len); + } + /* bits nest */ + len += nla_total_size(bits_len); + } + + /* outermost nest */ + return nla_total_size(len); +} + +/** + * ethnl_put_bitset32() - Put a bitset nest into a message + * @skb: skb with the message + * @attrtype: attribute type for the bitset nest + * @val: value bitmap (u32 based) + * @mask: mask bitmap (u32 based, optional) + * @nbits: bit length of the bitset + * @names: array of bit names (optional) + * @compact: use compact format for the output + * + * Compose a nested attribute representing a bitset. If @mask is null, simple + * bitmap (bit list) is created, if @mask is provided, represent a value/mask + * pair. Bit names are only used in verbose mode and when provided by calller. + * + * Return: 0 on success, negative error value on error + */ +int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, + const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact) +{ + struct nlattr *nest; + struct nlattr *attr; + + nest = nla_nest_start(skb, attrtype); + if (!nest) + return -EMSGSIZE; + + if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) + goto nla_put_failure; + if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) + goto nla_put_failure; + if (compact) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + unsigned int nbytes = nwords * sizeof(u32); + u32 *dst; + + attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); + if (!attr) + goto nla_put_failure; + dst = nla_data(attr); + memcpy(dst, val, nbytes); + if (nbits % 32) + dst[nwords - 1] &= ethnl_lower_bits(nbits); + + if (mask) { + attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); + if (!attr) + goto nla_put_failure; + dst = nla_data(attr); + memcpy(dst, mask, nbytes); + if (nbits % 32) + dst[nwords - 1] &= ethnl_lower_bits(nbits); + } + } else { + struct nlattr *bits; + unsigned int i; + + bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); + if (!bits) + goto nla_put_failure; + for (i = 0; i < nbits; i++) { + const char *name = names ? names[i] : NULL; + + if (!ethnl_bitmap32_test_bit(mask ?: val, i)) + continue; + attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); + if (!attr) + goto nla_put_failure; + if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) + goto nla_put_failure; + if (name && + ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) + goto nla_put_failure; + if (mask && ethnl_bitmap32_test_bit(val, i) && + nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) + goto nla_put_failure; + nla_nest_end(skb, attr); + } + nla_nest_end(skb, bits); + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = { + [ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, + [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, + [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, + [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, +}; + +static const struct nla_policy bit_policy[ETHTOOL_A_BITSET_BIT_MAX + 1] = { + [ETHTOOL_A_BITSET_BIT_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, + [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, +}; + +/** + * ethnl_bitset_is_compact() - check if bitset attribute represents a compact + * bitset + * @bitset: nested attribute representing a bitset + * @compact: pointer for return value + * + * Return: 0 on success, negative error code on failure + */ +int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, bitset, + bitset_policy, NULL); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BITS]) { + if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) + return -EINVAL; + *compact = false; + return 0; + } + if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) + return -EINVAL; + + *compact = true; + return 0; +} + +/** + * ethnl_name_to_idx() - look up string index for a name + * @names: array of ETH_GSTRING_LEN sized strings + * @n_names: number of strings in the array + * @name: name to look up + * + * Return: index of the string if found, -ENOENT if not found + */ +static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, + const char *name) +{ + unsigned int i; + + if (!names) + return -ENOENT; + + for (i = 0; i < n_names; i++) { + /* names[i] may not be null terminated */ + if (!strncmp(names[i], name, ETH_GSTRING_LEN) && + strlen(name) <= ETH_GSTRING_LEN) + return i; + } + + return -ENOENT; +} + +static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, + const struct nlattr *bit_attr, bool no_mask, + ethnl_string_array_t names, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_BIT_MAX + 1]; + int ret, idx; + + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_BIT_MAX, bit_attr, + bit_policy, extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { + const char *name; + + idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); + if (idx >= nbits) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_BITSET_BIT_INDEX], + "bit index too high"); + return -EOPNOTSUPP; + } + name = names ? names[idx] : NULL; + if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && + strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, + nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "bit index and name mismatch"); + return -EINVAL; + } + } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { + idx = ethnl_name_to_idx(names, nbits, + nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); + if (idx < 0) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_BITSET_BIT_NAME], + "bit name not found"); + return -EOPNOTSUPP; + } + } else { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "neither bit index nor name specified"); + return -EINVAL; + } + + *index = idx; + *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; + return 0; +} + +static int +ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, struct nlattr **tb, + ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + struct nlattr *bit_attr; + bool no_mask; + int rem; + int ret; + + if (tb[ETHTOOL_A_BITSET_VALUE]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], + "value only allowed in compact bitset"); + return -EINVAL; + } + if (tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "mask only allowed in compact bitset"); + return -EINVAL; + } + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + + nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { + bool old_val, new_val; + unsigned int idx; + + if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { + NL_SET_ERR_MSG_ATTR(extack, bit_attr, + "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); + return -EINVAL; + } + ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, + names, extack); + if (ret < 0) + return ret; + old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); + if (new_val != old_val) { + if (new_val) + bitmap[idx / 32] |= ((u32)1 << (idx % 32)); + else + bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); + *mod = true; + } + } + + return 0; +} + +static int ethnl_compact_sanity_checks(unsigned int nbits, + const struct nlattr *nest, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + unsigned int attr_nbits, attr_nwords; + const struct nlattr *test_attr; + + if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "mask not allowed in list bitset"); + return -EINVAL; + } + if (!tb[ETHTOOL_A_BITSET_SIZE]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing size in compact bitset"); + return -EINVAL; + } + if (!tb[ETHTOOL_A_BITSET_VALUE]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing value in compact bitset"); + return -EINVAL; + } + if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { + NL_SET_ERR_MSG_ATTR(extack, nest, + "missing mask in compact nonlist bitset"); + return -EINVAL; + } + + attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); + attr_nwords = DIV_ROUND_UP(attr_nbits, 32); + if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], + "bitset value length does not match size"); + return -EINVAL; + } + if (tb[ETHTOOL_A_BITSET_MASK] && + nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], + "bitset mask length does not match size"); + return -EINVAL; + } + if (attr_nbits <= nbits) + return 0; + + test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : + tb[ETHTOOL_A_BITSET_MASK]; + if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { + NL_SET_ERR_MSG_ATTR(extack, test_attr, + "cannot modify bits past kernel bitset size"); + return -EINVAL; + } + return 0; +} + +/** + * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap + * @bitmap: bitmap to update + * @nbits: size of the updated bitmap in bits + * @attr: nest attribute to parse and apply + * @names: array of bit names; may be null for compact format + * @extack: extack for error reporting + * @mod: set this to true if bitmap is modified, leave as it is if not + * + * Apply bitset netsted attribute to a bitmap. If the attribute represents + * a bit list, @bitmap is set to its contents; otherwise, bits in mask are + * set to values from value. Bitmaps in the attribute may be longer than + * @nbits but the message must not request modifying any bits past @nbits. + * + * Return: negative error code on failure, 0 on success + */ +int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1]; + unsigned int change_bits; + bool no_mask; + int ret; + + if (!attr) + return 0; + ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, attr, bitset_policy, + extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_BITSET_BITS]) + return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, + names, extack, mod); + ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); + if (ret < 0) + return ret; + + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + change_bits = min_t(unsigned int, + nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); + ethnl_bitmap32_update(bitmap, change_bits, + nla_data(tb[ETHTOOL_A_BITSET_VALUE]), + no_mask ? NULL : + nla_data(tb[ETHTOOL_A_BITSET_MASK]), + mod); + if (no_mask && change_bits < nbits) + ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); + + return 0; +} + +#if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) + +/* 64-bit big endian architectures are the only case when u32 based bitmaps + * and unsigned long based bitmaps have different memory layout so that we + * cannot simply cast the latter to the former and need actual wrappers + * converting the latter to the former. + * + * To reduce the number of slab allocations, the wrappers use fixed size local + * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the + * majority of bitmaps used by ethtool. + */ +#define ETHNL_SMALL_BITMAP_BITS 128 +#define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) + +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; + u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *mask32; + u32 *val32; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); + if (!val32) + return -ENOMEM; + mask32 = val32 + nwords; + } else { + val32 = small_val32; + mask32 = small_mask32; + } + + bitmap_to_arr32(val32, val, nbits); + if (mask) + bitmap_to_arr32(mask32, mask, nbits); + else + mask32 = NULL; + ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(val32); + + return ret; +} + +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; + u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *mask32; + u32 *val32; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int nwords = DIV_ROUND_UP(nbits, 32); + + val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); + if (!val32) + return -ENOMEM; + mask32 = val32 + nwords; + } else { + val32 = small_val32; + mask32 = small_mask32; + } + + bitmap_to_arr32(val32, val, nbits); + if (mask) + bitmap_to_arr32(mask32, mask, nbits); + else + mask32 = NULL; + ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, + compact); + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(val32); + + return ret; +} + +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; + u32 *bitmap32 = small_bitmap32; + bool u32_mod = false; + int ret; + + if (nbits > ETHNL_SMALL_BITMAP_BITS) { + unsigned int dst_words = DIV_ROUND_UP(nbits, 32); + + bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); + if (!bitmap32) + return -ENOMEM; + } + + bitmap_to_arr32(bitmap32, bitmap, nbits); + ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, + &u32_mod); + if (u32_mod) { + bitmap_from_arr32(bitmap, bitmap32, nbits); + *mod = true; + } + + if (nbits > ETHNL_SMALL_BITMAP_BITS) + kfree(bitmap32); + + return ret; +} + +#else + +/* On little endian 64-bit and all 32-bit architectures, an unsigned long + * based bitmap can be interpreted as u32 based one using a simple cast. + */ + +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, + names, compact); +} + +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact) +{ + return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, + (const u32 *)mask, nbits, names, compact); +} + +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod) +{ + return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, + mod); +} + +#endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */ diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h new file mode 100644 index 000000000000..b8247e34109d --- /dev/null +++ b/net/ethtool/bitset.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_BITSET_H +#define _NET_ETHTOOL_BITSET_H + +typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN]; + +int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact); +int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact); +int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact); +int ethnl_put_bitset(struct sk_buff *skb, int attrtype, + const unsigned long *val, const unsigned long *mask, + unsigned int nbits, ethnl_string_array_t names, + bool compact); +int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, + const u32 *mask, unsigned int nbits, + ethnl_string_array_t names, bool compact); +int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod); +int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, + const struct nlattr *attr, ethnl_string_array_t names, + struct netlink_ext_ack *extack, bool *mod); + +#endif /* _NET_ETHTOOL_BITSET_H */ -- cgit v1.2.3 From 6b08d6c146f4c5ed451c45339c10feb06d619db2 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:33 +0100 Subject: ethtool: support for netlink notifications Add infrastructure for ethtool netlink notifications. There is only one multicast group "monitor" which is used to notify userspace about changes and actions performed. Notification messages (types using suffix _NTF) share the format with replies to GET requests. Notifications are supposed to be broadcasted on every configuration change, whether it is done using the netlink interface or ioctl one. Netlink SET requests only trigger a notification if some data is actually changed. To trigger an ethtool notification, both ethtool netlink and external code use ethtool_notify() helper. This helper requires RTNL to be held and may sleep. Handlers sending messages for specific notification message types are registered in ethnl_notify_handlers array. As notifications can be triggered from other code, ethnl_ok flag is used to prevent an attempt to send notification before genetlink family is registered. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 5 +++++ include/linux/netdevice.h | 9 +++++++++ include/uapi/linux/ethtool_netlink.h | 2 ++ net/ethtool/netlink.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+) (limited to 'net') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index f27e92b5f344..c98f6852c8eb 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -5,5 +5,10 @@ #include #include +#include + +enum ethtool_multicast_groups { + ETHNL_MCGRP_MONITOR, +}; #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 469a297b58c0..f007155ae8f4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4393,6 +4393,15 @@ struct netdev_notifier_bonding_info { void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info); +#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) +void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data); +#else +static inline void ethtool_notify(struct net_device *dev, unsigned int cmd, + const void *data) +{ +} +#endif + static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 951203049615..d530ccb6f7e6 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -89,4 +89,6 @@ enum { #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 +#define ETHTOOL_MCGRP_MONITOR_NAME "monitor" + #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index aef882e0c3f5..c0f25c8f3565 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -6,6 +6,8 @@ static struct genl_family ethtool_genl_family; +static bool ethnl_ok __read_mostly; + static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = { [ETHTOOL_A_HEADER_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, @@ -169,11 +171,38 @@ err: return NULL; } +/* notifications */ + +typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, + const void *data); + +static const ethnl_notify_handler_t ethnl_notify_handlers[] = { +}; + +void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) +{ + if (unlikely(!ethnl_ok)) + return; + ASSERT_RTNL(); + + if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && + ethnl_notify_handlers[cmd])) + ethnl_notify_handlers[cmd](dev, cmd, data); + else + WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", + cmd, netdev_name(dev)); +} +EXPORT_SYMBOL(ethtool_notify); + /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { }; +static const struct genl_multicast_group ethtool_nl_mcgrps[] = { + [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME }, +}; + static struct genl_family ethtool_genl_family = { .name = ETHTOOL_GENL_NAME, .version = ETHTOOL_GENL_VERSION, @@ -181,6 +210,8 @@ static struct genl_family ethtool_genl_family = { .parallel_ops = true, .ops = ethtool_genl_ops, .n_ops = ARRAY_SIZE(ethtool_genl_ops), + .mcgrps = ethtool_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps), }; /* module setup */ @@ -192,6 +223,7 @@ static int __init ethnl_init(void) ret = genl_register_family(ðtool_genl_family); if (WARN(ret < 0, "ethtool: genetlink family registration failed")) return ret; + ethnl_ok = true; return 0; } -- cgit v1.2.3 From 728480f1244200fe294073afeb612c583a2080d2 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:38 +0100 Subject: ethtool: default handlers for GET requests Significant part of GET request processing is common for most request types but unfortunately it cannot be easily separated from type specific code as we need to alternate between common actions (parsing common request header, allocating message and filling netlink/genetlink headers etc.) and specific actions (querying the device, composing the reply). The processing also happens in three different situations: "do" request, "dump" request and notification, each doing things in slightly different way. The request specific code is implemented in four or five callbacks defined in an instance of struct get_request_ops: parse_request() - parse incoming message prepare_data() - retrieve data from driver or NIC reply_size() - estimate reply message size fill_reply() - compose reply message cleanup_data() - (optional) clean up additional data Other members of struct get_request_ops describe the data structure holding information from client request and data used to compose the message. The default handlers ethnl_default_doit(), ethnl_default_dumpit(), ethnl_default_start() and ethnl_default_done() can be then used in genl_ops handler. Notification handler will be introduced in a later patch. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 321 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ethtool/netlink.h | 116 +++++++++++++++++- 2 files changed, 436 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index c0f25c8f3565..ae63e8423072 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -171,6 +171,327 @@ err: return NULL; } +/* GET request helpers */ + +/** + * struct ethnl_dump_ctx - context structure for generic dumpit() callback + * @ops: request ops of currently processed message type + * @req_info: parsed request header of processed request + * @pos_hash: saved iteration position - hashbucket + * @pos_idx: saved iteration position - index + * + * These parameters are kept in struct netlink_callback as context preserved + * between iterations. They are initialized by ethnl_default_start() and used + * in ethnl_default_dumpit() and ethnl_default_done(). + */ +struct ethnl_dump_ctx { + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct ethnl_reply_data *reply_data; + int pos_hash; + int pos_idx; +}; + +static const struct ethnl_request_ops * +ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { +}; + +static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) +{ + return (struct ethnl_dump_ctx *)cb->ctx; +} + +/** + * ethnl_default_parse() - Parse request message + * @req_info: pointer to structure to put data into + * @nlhdr: pointer to request message header + * @net: request netns + * @request_ops: struct request_ops for request type + * @extack: netlink extack for error reporting + * @require_dev: fail if no device identified in header + * + * Parse universal request header and call request specific ->parse_request() + * callback (if defined) to parse the rest of the message. + * + * Return: 0 on success or negative error code + */ +static int ethnl_default_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + const struct ethnl_request_ops *request_ops, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr **tb; + int ret; + + tb = kmalloc_array(request_ops->max_attr + 1, sizeof(tb[0]), + GFP_KERNEL); + if (!tb) + return -ENOMEM; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, request_ops->max_attr, + request_ops->request_policy, extack); + if (ret < 0) + goto out; + ret = ethnl_parse_header(req_info, tb[request_ops->hdr_attr], net, + extack, require_dev); + if (ret < 0) + goto out; + + if (request_ops->parse_request) { + ret = request_ops->parse_request(req_info, tb, extack); + if (ret < 0) + goto out; + } + + ret = 0; +out: + kfree(tb); + return ret; +} + +/** + * ethnl_init_reply_data() - Initialize reply data for GET request + * @req_info: pointer to embedded struct ethnl_req_info + * @ops: instance of struct ethnl_request_ops describing the layout + * @dev: network device to initialize the reply for + * + * Fills the reply data part with zeros and sets the dev member. Must be called + * before calling the ->fill_reply() callback (for each iteration when handling + * dump requests). + */ +static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data, + const struct ethnl_request_ops *ops, + struct net_device *dev) +{ + memset(reply_data, 0, ops->reply_data_size); + reply_data->dev = dev; +} + +/* default ->doit() handler for GET type requests */ +static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_reply_data *reply_data = NULL; + struct ethnl_req_info *req_info = NULL; + const u8 cmd = info->genlhdr->cmd; + const struct ethnl_request_ops *ops; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ops = ethnl_default_requests[cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return -ENOMEM; + } + + ret = ethnl_default_parse(req_info, info->nlhdr, genl_info_net(info), ops, + info->extack, !ops->allow_nodev_do); + if (ret < 0) + goto err_dev; + ethnl_init_reply_data(reply_data, ops, req_info->dev); + + rtnl_lock(); + ret = ops->prepare_data(req_info, reply_data, info); + rtnl_unlock(); + if (ret < 0) + goto err_cleanup; + reply_len = ops->reply_size(req_info, reply_data); + if (ret < 0) + goto err_cleanup; + ret = -ENOMEM; + rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, + ops->hdr_attr, info, &reply_payload); + if (!rskb) + goto err_cleanup; + ret = ops->fill_reply(rskb, req_info, reply_data); + if (ret < 0) + goto err_msg; + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + + genlmsg_end(rskb, reply_payload); + if (req_info->dev) + dev_put(req_info->dev); + kfree(reply_data); + kfree(req_info); + return genlmsg_reply(rskb, info); + +err_msg: + WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); + nlmsg_free(rskb); +err_cleanup: + if (ops->cleanup_data) + ops->cleanup_data(reply_data); +err_dev: + if (req_info->dev) + dev_put(req_info->dev); + kfree(reply_data); + kfree(req_info); + return ret; +} + +static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, + const struct ethnl_dump_ctx *ctx) +{ + int ret; + + ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); + rtnl_lock(); + ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, NULL); + rtnl_unlock(); + if (ret < 0) + goto out; + ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); + if (ret < 0) + goto out; + ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data); + +out: + if (ctx->ops->cleanup_data) + ctx->ops->cleanup_data(ctx->reply_data); + ctx->reply_data->dev = NULL; + return ret; +} + +/* Default ->dumpit() handler for GET requests. Device iteration copied from + * rtnl_dump_ifinfo(); we have to be more careful about device hashtable + * persistence as we cannot guarantee to hold RTNL lock through the whole + * function as rtnetnlink does. + */ +static int ethnl_default_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + unsigned int seq; + + head = &net->dev_index_head[h]; + +restart_chain: + seq = net->dev_base_seq; + cb->seq = seq; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + dev_hold(dev); + rtnl_unlock(); + + ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + ðtool_genl_family, 0, + ctx->ops->reply_cmd); + if (!ehdr) { + dev_put(dev); + ret = -EMSGSIZE; + goto out; + } + ret = ethnl_default_dump_one(skb, dev, ctx); + dev_put(dev); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto lock_and_cont; + if (likely(skb->len)) + ret = skb->len; + goto out; + } + genlmsg_end(skb, ehdr); +lock_and_cont: + rtnl_lock(); + if (net->dev_base_seq != seq) { + s_idx = idx + 1; + goto restart_chain; + } +cont: + idx++; + } + + } + rtnl_unlock(); + +out: + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + return ret; +} + +/* generic ->start() handler for GET requests */ +static int ethnl_default_start(struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct genlmsghdr *ghdr; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + ghdr = nlmsg_data(cb->nlh); + ops = ethnl_default_requests[ghdr->cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return -ENOMEM; + } + + ret = ethnl_default_parse(req_info, cb->nlh, sock_net(cb->skb->sk), ops, + cb->extack, false); + if (req_info->dev) { + /* We ignore device specification in dump requests but as the + * same parser as for non-dump (doit) requests is used, it + * would take reference to the device if it finds one + */ + dev_put(req_info->dev); + req_info->dev = NULL; + } + if (ret < 0) + return ret; + + ctx->ops = ops; + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_hash = 0; + ctx->pos_idx = 0; + + return 0; +} + +/* default ->done() handler for GET requests */ +static int ethnl_default_done(struct netlink_callback *cb) +{ + struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); + + kfree(ctx->reply_data); + kfree(ctx->req_info); + + return 0; +} + /* notifications */ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 05d7183da894..6ec0dd06277f 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -200,16 +200,130 @@ static inline unsigned int ethnl_reply_header_size(void) nla_total_size(IFNAMSIZ)); } +/* GET request handling */ + +/* Unified processing of GET requests uses two data structures: request info + * and reply data. Request info holds information parsed from client request + * and its stays constant through all request processing. Reply data holds data + * retrieved from ethtool_ops callbacks or other internal sources which is used + * to compose the reply. When processing a dump request, request info is filled + * only once (when the request message is parsed) but reply data is filled for + * each reply message. + * + * Both structures consist of part common for all request types (struct + * ethnl_req_info and struct ethnl_reply_data defined below) and optional + * parts specific for each request type. Common part always starts at offset 0. + */ + /** * struct ethnl_req_info - base type of request information for GET requests * @dev: network device the request is for (may be null) * @flags: request flags common for all request types * - * This is a common base, additional members may follow after this structure. + * This is a common base for request specific structures holding data from + * parsed userspace request. These always embed struct ethnl_req_info at + * zero offset. */ struct ethnl_req_info { struct net_device *dev; u32 flags; }; +/** + * struct ethnl_reply_data - base type of reply data for GET requests + * @dev: device for current reply message; in single shot requests it is + * equal to ðnl_req_info.dev; in dumps it's different for each + * reply message + * + * This is a common base for request specific structures holding data for + * kernel reply message. These always embed struct ethnl_reply_data at zero + * offset. + */ +struct ethnl_reply_data { + struct net_device *dev; +}; + +static inline int ethnl_ops_begin(struct net_device *dev) +{ + if (dev && dev->ethtool_ops->begin) + return dev->ethtool_ops->begin(dev); + else + return 0; +} + +static inline void ethnl_ops_complete(struct net_device *dev) +{ + if (dev && dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); +} + +/** + * struct ethnl_request_ops - unified handling of GET requests + * @request_cmd: command id for request (GET) + * @reply_cmd: command id for reply (GET_REPLY) + * @hdr_attr: attribute type for request header + * @max_attr: maximum (top level) attribute type + * @req_info_size: size of request info + * @reply_data_size: size of reply data + * @request_policy: netlink policy for message contents + * @allow_nodev_do: allow non-dump request with no device identification + * @parse_request: + * Parse request except common header (struct ethnl_req_info). Common + * header is already filled on entry, the rest up to @repdata_offset + * is zero initialized. This callback should only modify type specific + * request info by parsed attributes from request message. + * @prepare_data: + * Retrieve and prepare data needed to compose a reply message. Calls to + * ethtool_ops handlers are limited to this callback. Common reply data + * (struct ethnl_reply_data) is filled on entry, type specific part after + * it is zero initialized. This callback should only modify the type + * specific part of reply data. Device identification from struct + * ethnl_reply_data is to be used as for dump requests, it iterates + * through network devices while dev member of struct ethnl_req_info + * points to the device from client request. + * @reply_size: + * Estimate reply message size. Returned value must be sufficient for + * message payload without common reply header. The callback may returned + * estimate higher than actual message size if exact calculation would + * not be worth the saved memory space. + * @fill_reply: + * Fill reply message payload (except for common header) from reply data. + * The callback must not generate more payload than previously called + * ->reply_size() estimated. + * @cleanup_data: + * Optional cleanup called when reply data is no longer needed. Can be + * used e.g. to free any additional data structures outside the main + * structure which were allocated by ->prepare_data(). When processing + * dump requests, ->cleanup() is called for each message. + * + * Description of variable parts of GET request handling when using the + * unified infrastructure. When used, a pointer to an instance of this + * structure is to be added to ðnl_default_requests array and generic + * handlers ethnl_default_doit(), ethnl_default_dumpit(), + * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops. + */ +struct ethnl_request_ops { + u8 request_cmd; + u8 reply_cmd; + u16 hdr_attr; + unsigned int max_attr; + unsigned int req_info_size; + unsigned int reply_data_size; + const struct nla_policy *request_policy; + bool allow_nodev_do; + + int (*parse_request)(struct ethnl_req_info *req_info, + struct nlattr **tb, + struct netlink_ext_ack *extack); + int (*prepare_data)(const struct ethnl_req_info *req_info, + struct ethnl_reply_data *reply_data, + struct genl_info *info); + int (*reply_size)(const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data); + int (*fill_reply)(struct sk_buff *skb, + const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data); + void (*cleanup_data)(struct ethnl_reply_data *reply_data); +}; + #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 71921690f9745fef60a2bad425f30adf8cdc9da0 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:43 +0100 Subject: ethtool: provide string sets with STRSET_GET request Requests a contents of one or more string sets, i.e. indexed arrays of strings; this information is provided by ETHTOOL_GSSET_INFO and ETHTOOL_GSTRINGS commands of ioctl interface. Unlike ioctl interface, all information can be retrieved with one request and mulitple string sets can be requested at once. There are three types of requests: - no NLM_F_DUMP, no device: get "global" stringsets - no NLM_F_DUMP, with device: get string sets related to the device - NLM_F_DUMP, no device: get device related string sets for all devices Client can request either all string sets of given type (global or device related) or only specific sets. With ETHTOOL_A_STRSET_COUNTS flag set, only set sizes (numbers of strings) are returned. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 75 ++++- include/uapi/linux/ethtool.h | 3 + include/uapi/linux/ethtool_netlink.h | 56 ++++ net/ethtool/Makefile | 2 +- net/ethtool/netlink.c | 8 + net/ethtool/netlink.h | 4 + net/ethtool/strset.c | 425 +++++++++++++++++++++++++++ 7 files changed, 570 insertions(+), 3 deletions(-) create mode 100644 net/ethtool/strset.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 7797f1237472..3912cb0eb9c6 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -176,6 +176,18 @@ according to message purpose: ``_NTF`` kernel notification ============== ====================================== +Userspace to kernel: + + ===================================== ================================ + ``ETHTOOL_MSG_STRSET_GET`` get string set + ===================================== ================================ + +Kernel to userspace: + + ===================================== ================================ + ``ETHTOOL_MSG_STRSET_GET_REPLY`` string set contents + ===================================== ================================ + ``GET`` requests are sent by userspace applications to retrieve device information. They usually do not contain any message specific attributes. Kernel replies with corresponding "GET_REPLY" message. For most types, ``GET`` @@ -207,6 +219,65 @@ an ``ACT_REPLY`` message. Performing an action also triggers a notification Later sections describe the format and semantics of these messages. +STRSET_GET +========== + +Requests contents of a string set as provided by ioctl commands +``ETHTOOL_GSSET_INFO`` and ``ETHTOOL_GSTRINGS.`` String sets are not user +writeable so that the corresponding ``STRSET_SET`` message is only used in +kernel replies. There are two types of string sets: global (independent of +a device, e.g. device feature names) and device specific (e.g. device private +flags). + +Request contents: + + +---------------------------------------+--------+------------------------+ + | ``ETHTOOL_A_STRSET_HEADER`` | nested | request header | + +---------------------------------------+--------+------------------------+ + | ``ETHTOOL_A_STRSET_STRINGSETS`` | nested | string set to request | + +-+-------------------------------------+--------+------------------------+ + | | ``ETHTOOL_A_STRINGSETS_STRINGSET+`` | nested | one string set | + +-+-+-----------------------------------+--------+------------------------+ + | | | ``ETHTOOL_A_STRINGSET_ID`` | u32 | set id | + +-+-+-----------------------------------+--------+------------------------+ + +Kernel response contents: + + +---------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_STRSET_HEADER`` | nested | reply header | + +---------------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_STRSET_STRINGSETS`` | nested | array of string sets | + +-+-------------------------------------+--------+-----------------------+ + | | ``ETHTOOL_A_STRINGSETS_STRINGSET+`` | nested | one string set | + +-+-+-----------------------------------+--------+-----------------------+ + | | | ``ETHTOOL_A_STRINGSET_ID`` | u32 | set id | + +-+-+-----------------------------------+--------+-----------------------+ + | | | ``ETHTOOL_A_STRINGSET_COUNT`` | u32 | number of strings | + +-+-+-----------------------------------+--------+-----------------------+ + | | | ``ETHTOOL_A_STRINGSET_STRINGS`` | nested | array of strings | + +-+-+-+---------------------------------+--------+-----------------------+ + | | | | ``ETHTOOL_A_STRINGS_STRING+`` | nested | one string | + +-+-+-+-+-------------------------------+--------+-----------------------+ + | | | | | ``ETHTOOL_A_STRING_INDEX`` | u32 | string index | + +-+-+-+-+-------------------------------+--------+-----------------------+ + | | | | | ``ETHTOOL_A_STRING_VALUE`` | string | string value | + +-+-+-+-+-------------------------------+--------+-----------------------+ + | ``ETHTOOL_A_STRSET_COUNTS_ONLY`` | flag | return only counts | + +---------------------------------------+--------+-----------------------+ + +Device identification in request header is optional. Depending on its presence +a and ``NLM_F_DUMP`` flag, there are three type of ``STRSET_GET`` requests: + + - no ``NLM_F_DUMP,`` no device: get "global" stringsets + - no ``NLM_F_DUMP``, with device: get string sets related to the device + - ``NLM_F_DUMP``, no device: get device related string sets for all devices + +If there is no ``ETHTOOL_A_STRSET_STRINGSETS`` array, all string sets of +requested type are returned, otherwise only those specified in the request. +Flag ``ETHTOOL_A_STRSET_COUNTS_ONLY`` tells kernel to only return string +counts of the sets, not the actual strings. + + Request translation =================== @@ -242,7 +313,7 @@ have their netlink replacement yet. ``ETHTOOL_GSG`` n/a ``ETHTOOL_SSG`` n/a ``ETHTOOL_TEST`` n/a - ``ETHTOOL_GSTRINGS`` n/a + ``ETHTOOL_GSTRINGS`` ``ETHTOOL_MSG_STRSET_GET`` ``ETHTOOL_PHYS_ID`` n/a ``ETHTOOL_GSTATS`` n/a ``ETHTOOL_GTSO`` n/a @@ -270,7 +341,7 @@ have their netlink replacement yet. ``ETHTOOL_RESET`` n/a ``ETHTOOL_SRXNTUPLE`` n/a ``ETHTOOL_GRXNTUPLE`` n/a - ``ETHTOOL_GSSET_INFO`` n/a + ``ETHTOOL_GSSET_INFO`` ``ETHTOOL_MSG_STRSET_GET`` ``ETHTOOL_GRXFHINDIR`` n/a ``ETHTOOL_SRXFHINDIR`` n/a ``ETHTOOL_GFEATURES`` n/a diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f44155840b07..116bcbf09c74 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -606,6 +606,9 @@ enum ethtool_stringset { ETH_SS_PHY_STATS, ETH_SS_PHY_TUNABLES, ETH_SS_LINK_MODES, + + /* add new constants above here */ + ETH_SS_COUNT }; /** diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index d530ccb6f7e6..cabef1fec42a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -14,6 +14,7 @@ /* message types - userspace to kernel */ enum { ETHTOOL_MSG_USER_NONE, + ETHTOOL_MSG_STRSET_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -23,6 +24,7 @@ enum { /* message types - kernel to userspace */ enum { ETHTOOL_MSG_KERNEL_NONE, + ETHTOOL_MSG_STRSET_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -85,6 +87,60 @@ enum { ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 }; +/* string sets */ + +enum { + ETHTOOL_A_STRING_UNSPEC, + ETHTOOL_A_STRING_INDEX, /* u32 */ + ETHTOOL_A_STRING_VALUE, /* string */ + + /* add new constants above here */ + __ETHTOOL_A_STRING_CNT, + ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGS_UNSPEC, + ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGS_CNT, + ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGSET_UNSPEC, + ETHTOOL_A_STRINGSET_ID, /* u32 */ + ETHTOOL_A_STRINGSET_COUNT, /* u32 */ + ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGSET_CNT, + ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGSETS_UNSPEC, + ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGSETS_CNT, + ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 +}; + +/* STRSET */ + +enum { + ETHTOOL_A_STRSET_UNSPEC, + ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ + ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ + + /* add new constants above here */ + __ETHTOOL_A_STRSET_CNT, + ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index a7e6c2c85db9..efcc42c34d62 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -4,4 +4,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o -ethtool_nl-y := netlink.o bitset.o +ethtool_nl-y := netlink.o bitset.o strset.o diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index ae63e8423072..7dc082bde670 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -194,6 +194,7 @@ struct ethnl_dump_ctx { static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { + [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -518,6 +519,13 @@ EXPORT_SYMBOL(ethtool_notify); /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { + { + .cmd = ETHTOOL_MSG_STRSET_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 6ec0dd06277f..44e9f63aefb7 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -326,4 +326,8 @@ struct ethnl_request_ops { void (*cleanup_data)(struct ethnl_reply_data *reply_data); }; +/* request handlers */ + +extern const struct ethnl_request_ops ethnl_strset_request_ops; + #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c new file mode 100644 index 000000000000..9f2243329015 --- /dev/null +++ b/net/ethtool/strset.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include "netlink.h" +#include "common.h" + +struct strset_info { + bool per_dev; + bool free_strings; + unsigned int count; + const char (*strings)[ETH_GSTRING_LEN]; +}; + +static const struct strset_info info_template[] = { + [ETH_SS_TEST] = { + .per_dev = true, + }, + [ETH_SS_STATS] = { + .per_dev = true, + }, + [ETH_SS_PRIV_FLAGS] = { + .per_dev = true, + }, + [ETH_SS_FEATURES] = { + .per_dev = false, + .count = ARRAY_SIZE(netdev_features_strings), + .strings = netdev_features_strings, + }, + [ETH_SS_RSS_HASH_FUNCS] = { + .per_dev = false, + .count = ARRAY_SIZE(rss_hash_func_strings), + .strings = rss_hash_func_strings, + }, + [ETH_SS_TUNABLES] = { + .per_dev = false, + .count = ARRAY_SIZE(tunable_strings), + .strings = tunable_strings, + }, + [ETH_SS_PHY_STATS] = { + .per_dev = true, + }, + [ETH_SS_PHY_TUNABLES] = { + .per_dev = false, + .count = ARRAY_SIZE(phy_tunable_strings), + .strings = phy_tunable_strings, + }, + [ETH_SS_LINK_MODES] = { + .per_dev = false, + .count = __ETHTOOL_LINK_MODE_MASK_NBITS, + .strings = link_mode_names, + }, +}; + +struct strset_req_info { + struct ethnl_req_info base; + u32 req_ids; + bool counts_only; +}; + +#define STRSET_REQINFO(__req_base) \ + container_of(__req_base, struct strset_req_info, base) + +struct strset_reply_data { + struct ethnl_reply_data base; + struct strset_info sets[ETH_SS_COUNT]; +}; + +#define STRSET_REPDATA(__reply_base) \ + container_of(__reply_base, struct strset_reply_data, base) + +static const struct nla_policy strset_get_policy[ETHTOOL_A_STRSET_MAX + 1] = { + [ETHTOOL_A_STRSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRSET_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +get_stringset_policy[ETHTOOL_A_STRINGSET_MAX + 1] = { + [ETHTOOL_A_STRINGSET_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 }, + [ETHTOOL_A_STRINGSET_COUNT] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSET_STRINGS] = { .type = NLA_REJECT }, +}; + +/** + * strset_include() - test if a string set should be included in reply + * @data: pointer to request data structure + * @id: id of string set to check (ETH_SS_* constants) + */ +static bool strset_include(const struct strset_req_info *info, + const struct strset_reply_data *data, u32 id) +{ + bool per_dev; + + BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids)); + + if (info->req_ids) + return info->req_ids & (1U << id); + per_dev = data->sets[id].per_dev; + if (!per_dev && !data->sets[id].strings) + return false; + + return data->base.dev ? per_dev : !per_dev; +} + +static int strset_get_id(const struct nlattr *nest, u32 *val, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ETHTOOL_A_STRINGSET_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_STRINGSET_MAX, nest, + get_stringset_policy, extack); + if (ret < 0) + return ret; + if (!tb[ETHTOOL_A_STRINGSET_ID]) + return -EINVAL; + + *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]); + return 0; +} + +static const struct nla_policy +strset_stringsets_policy[ETHTOOL_A_STRINGSETS_MAX + 1] = { + [ETHTOOL_A_STRINGSETS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED }, +}; + +static int strset_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct strset_req_info *req_info = STRSET_REQINFO(req_base); + struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS]; + struct nlattr *attr; + int rem, ret; + + if (!nest) + return 0; + ret = nla_validate_nested(nest, ETHTOOL_A_STRINGSETS_MAX, + strset_stringsets_policy, extack); + if (ret < 0) + return ret; + + req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY]; + nla_for_each_nested(attr, nest, rem) { + u32 id; + + if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET, + "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n", + nla_type(attr))) + return -EINVAL; + + ret = strset_get_id(attr, &id, extack); + if (ret < 0) + return ret; + if (ret >= ETH_SS_COUNT) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "unknown string set id"); + return -EOPNOTSUPP; + } + + req_info->req_ids |= (1U << id); + } + + return 0; +} + +static void strset_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct strset_reply_data *data = STRSET_REPDATA(reply_base); + unsigned int i; + + for (i = 0; i < ETH_SS_COUNT; i++) + if (data->sets[i].free_strings) { + kfree(data->sets[i].strings); + data->sets[i].strings = NULL; + data->sets[i].free_strings = false; + } +} + +static int strset_prepare_set(struct strset_info *info, struct net_device *dev, + unsigned int id, bool counts_only) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + void *strings; + int count, ret; + + if (id == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + ret = phy_ethtool_get_sset_count(dev->phydev); + else if (ops->get_sset_count && ops->get_strings) + ret = ops->get_sset_count(dev, id); + else + ret = -EOPNOTSUPP; + if (ret <= 0) { + info->count = 0; + return 0; + } + + count = ret; + if (!counts_only) { + strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL); + if (!strings) + return -ENOMEM; + if (id == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + phy_ethtool_get_strings(dev->phydev, strings); + else + ops->get_strings(dev, id, strings); + info->strings = strings; + info->free_strings = true; + } + info->count = count; + + return 0; +} + +static int strset_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + struct strset_reply_data *data = STRSET_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + unsigned int i; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT); + memcpy(&data->sets, &info_template, sizeof(data->sets)); + + if (!dev) { + for (i = 0; i < ETH_SS_COUNT; i++) { + if ((req_info->req_ids & (1U << i)) && + data->sets[i].per_dev) { + if (info) + GENL_SET_ERR_MSG(info, "requested per device strings without dev"); + return -EINVAL; + } + } + } + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto err_strset; + for (i = 0; i < ETH_SS_COUNT; i++) { + if (!strset_include(req_info, data, i) || + !data->sets[i].per_dev) + continue; + + ret = strset_prepare_set(&data->sets[i], dev, i, + req_info->counts_only); + if (ret < 0) + goto err_ops; + } + ethnl_ops_complete(dev); + + return 0; +err_ops: + ethnl_ops_complete(dev); +err_strset: + strset_cleanup_data(reply_base); + return ret; +} + +/* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */ +static int strset_set_size(const struct strset_info *info, bool counts_only) +{ + unsigned int len = 0; + unsigned int i; + + if (info->count == 0) + return 0; + if (counts_only) + return nla_total_size(2 * nla_total_size(sizeof(u32))); + + for (i = 0; i < info->count; i++) { + const char *str = info->strings[i]; + + /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */ + len += nla_total_size(nla_total_size(sizeof(u32)) + + ethnl_strz_size(str)); + } + /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */ + len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len); + + return nla_total_size(len); +} + +static int strset_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + const struct strset_reply_data *data = STRSET_REPDATA(reply_base); + unsigned int i; + int len = 0; + int ret; + + len += ethnl_reply_header_size(); + for (i = 0; i < ETH_SS_COUNT; i++) { + const struct strset_info *set_info = &data->sets[i]; + + if (!strset_include(req_info, data, i)) + continue; + + ret = strset_set_size(set_info, req_info->counts_only); + if (ret < 0) + return ret; + len += ret; + } + + return len; +} + +/* fill one string into reply */ +static int strset_fill_string(struct sk_buff *skb, + const struct strset_info *set_info, u32 idx) +{ + struct nlattr *string_attr; + const char *value; + + value = set_info->strings[idx]; + + string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING); + if (!string_attr) + return -EMSGSIZE; + if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) || + ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value)) + goto nla_put_failure; + nla_nest_end(skb, string_attr); + + return 0; +nla_put_failure: + nla_nest_cancel(skb, string_attr); + return -EMSGSIZE; +} + +/* fill one string set into reply */ +static int strset_fill_set(struct sk_buff *skb, + const struct strset_info *set_info, u32 id, + bool counts_only) +{ + struct nlattr *stringset_attr; + struct nlattr *strings_attr; + unsigned int i; + + if (!set_info->per_dev && !set_info->strings) + return -EOPNOTSUPP; + if (set_info->count == 0) + return 0; + stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET); + if (!stringset_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) || + nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count)) + goto nla_put_failure; + + if (!counts_only) { + strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS); + if (!strings_attr) + goto nla_put_failure; + for (i = 0; i < set_info->count; i++) { + if (strset_fill_string(skb, set_info, i) < 0) + goto nla_put_failure; + } + nla_nest_end(skb, strings_attr); + } + + nla_nest_end(skb, stringset_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, stringset_attr); + return -EMSGSIZE; +} + +static int strset_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct strset_req_info *req_info = STRSET_REQINFO(req_base); + const struct strset_reply_data *data = STRSET_REPDATA(reply_base); + struct nlattr *nest; + unsigned int i; + int ret; + + nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS); + if (!nest) + return -EMSGSIZE; + + for (i = 0; i < ETH_SS_COUNT; i++) { + if (strset_include(req_info, data, i)) { + ret = strset_fill_set(skb, &data->sets[i], i, + req_info->counts_only); + if (ret < 0) + goto nla_put_failure; + } + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return ret; +} + +const struct ethnl_request_ops ethnl_strset_request_ops = { + .request_cmd = ETHTOOL_MSG_STRSET_GET, + .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY, + .hdr_attr = ETHTOOL_A_STRSET_HEADER, + .max_attr = ETHTOOL_A_STRSET_MAX, + .req_info_size = sizeof(struct strset_req_info), + .reply_data_size = sizeof(struct strset_reply_data), + .request_policy = strset_get_policy, + .allow_nodev_do = true, + + .parse_request = strset_parse_request, + .prepare_data = strset_prepare_data, + .reply_size = strset_reply_size, + .fill_reply = strset_fill_reply, + .cleanup_data = strset_cleanup_data, +}; -- cgit v1.2.3 From 459e0b81b37043545d90629fdfb243444151e77d Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:48 +0100 Subject: ethtool: provide link settings with LINKINFO_GET request Implement LINKINFO_GET netlink request to get basic link settings provided by ETHTOOL_GLINKSETTINGS and ETHTOOL_GSET ioctl commands. This request provides settings not directly related to autonegotiation and link mode selection: physical port, phy MDIO address, MDI(-X) status, MDI(-X) control and transceiver. LINKINFO_GET request can be used with NLM_F_DUMP (without device identification) to request the information for all devices in current network namespace providing the data. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 37 ++++++++++- include/uapi/linux/ethtool_netlink.h | 18 ++++++ net/ethtool/Makefile | 2 +- net/ethtool/common.c | 48 ++++++++++++++ net/ethtool/common.h | 4 ++ net/ethtool/ioctl.c | 48 -------------- net/ethtool/linkinfo.c | 94 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 8 +++ net/ethtool/netlink.h | 1 + 9 files changed, 209 insertions(+), 51 deletions(-) create mode 100644 net/ethtool/linkinfo.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 3912cb0eb9c6..60684786a92c 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -180,12 +180,14 @@ Userspace to kernel: ===================================== ================================ ``ETHTOOL_MSG_STRSET_GET`` get string set + ``ETHTOOL_MSG_LINKINFO_GET`` get link settings ===================================== ================================ Kernel to userspace: ===================================== ================================ ``ETHTOOL_MSG_STRSET_GET_REPLY`` string set contents + ``ETHTOOL_MSG_LINKINFO_GET_REPLY`` link settings ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device @@ -278,6 +280,37 @@ Flag ``ETHTOOL_A_STRSET_COUNTS_ONLY`` tells kernel to only return string counts of the sets, not the actual strings. +LINKINFO_GET +============ + +Requests link settings as provided by ``ETHTOOL_GLINKSETTINGS`` except for +link modes and autonegotiation related information. The request does not use +any attributes. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKINFO_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKINFO_HEADER`` nested reply header + ``ETHTOOL_A_LINKINFO_PORT`` u8 physical port + ``ETHTOOL_A_LINKINFO_PHYADDR`` u8 phy MDIO address + ``ETHTOOL_A_LINKINFO_TP_MDIX`` u8 MDI(-X) status + ``ETHTOOL_A_LINKINFO_TP_MDIX_CTRL`` u8 MDI(-X) control + ``ETHTOOL_A_LINKINFO_TRANSCEIVER`` u8 transceiver + ==================================== ====== ========================== + +Attributes and their values have the same meaning as matching members of the +corresponding ioctl structures. + +``LINKINFO_GET`` allows dump requests (kernel returns reply message for all +devices supporting the request). + + Request translation =================== @@ -288,7 +321,7 @@ have their netlink replacement yet. =================================== ===================================== ioctl command netlink command =================================== ===================================== - ``ETHTOOL_GSET`` n/a + ``ETHTOOL_GSET`` ``ETHTOOL_MSG_LINKINFO_GET`` ``ETHTOOL_SSET`` n/a ``ETHTOOL_GDRVINFO`` n/a ``ETHTOOL_GREGS`` n/a @@ -362,7 +395,7 @@ have their netlink replacement yet. ``ETHTOOL_STUNABLE`` n/a ``ETHTOOL_GPHYSTATS`` n/a ``ETHTOOL_PERQUEUE`` n/a - ``ETHTOOL_GLINKSETTINGS`` n/a + ``ETHTOOL_GLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_GET`` ``ETHTOOL_SLINKSETTINGS`` n/a ``ETHTOOL_PHY_GTUNABLE`` n/a ``ETHTOOL_PHY_STUNABLE`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index cabef1fec42a..1966532993e5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -15,6 +15,7 @@ enum { ETHTOOL_MSG_USER_NONE, ETHTOOL_MSG_STRSET_GET, + ETHTOOL_MSG_LINKINFO_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -25,6 +26,7 @@ enum { enum { ETHTOOL_MSG_KERNEL_NONE, ETHTOOL_MSG_STRSET_GET_REPLY, + ETHTOOL_MSG_LINKINFO_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -141,6 +143,22 @@ enum { ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 }; +/* LINKINFO */ + +enum { + ETHTOOL_A_LINKINFO_UNSPEC, + ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKINFO_PORT, /* u8 */ + ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ + ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ + ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ + ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKINFO_CNT, + ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index efcc42c34d62..765736ec52c0 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -4,4 +4,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o -ethtool_nl-y := netlink.o bitset.o strset.o +ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 0a8728565356..1d4a0aeff2cb 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -169,3 +169,51 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, CR8, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); + +/* return false if legacy contained non-0 deprecated fields + * maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + ethtool_convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} diff --git a/net/ethtool/common.h b/net/ethtool/common.h index bbb788908cb1..c8a237402729 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -19,4 +19,8 @@ extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; +bool convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings); + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 88f7cddf5a6f..8a0a13b478e0 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -358,54 +358,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, } EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); -/* return false if legacy contained non-0 deprecated fields - * maxtxpkt/maxrxpkt. rest of ksettings always updated - */ -static bool -convert_legacy_settings_to_link_ksettings( - struct ethtool_link_ksettings *link_ksettings, - const struct ethtool_cmd *legacy_settings) -{ - bool retval = true; - - memset(link_ksettings, 0, sizeof(*link_ksettings)); - - /* This is used to tell users that driver is still using these - * deprecated legacy fields, and they should not use - * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS - */ - if (legacy_settings->maxtxpkt || - legacy_settings->maxrxpkt) - retval = false; - - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.supported, - legacy_settings->supported); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.advertising, - legacy_settings->advertising); - ethtool_convert_legacy_u32_to_link_mode( - link_ksettings->link_modes.lp_advertising, - legacy_settings->lp_advertising); - link_ksettings->base.speed - = ethtool_cmd_speed(legacy_settings); - link_ksettings->base.duplex - = legacy_settings->duplex; - link_ksettings->base.port - = legacy_settings->port; - link_ksettings->base.phy_address - = legacy_settings->phy_address; - link_ksettings->base.autoneg - = legacy_settings->autoneg; - link_ksettings->base.mdio_support - = legacy_settings->mdio_support; - link_ksettings->base.eth_tp_mdix - = legacy_settings->eth_tp_mdix; - link_ksettings->base.eth_tp_mdix_ctrl - = legacy_settings->eth_tp_mdix_ctrl; - return retval; -} - /* return false if ksettings link modes had higher bits * set. legacy_settings always updated (best effort) */ diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c new file mode 100644 index 000000000000..f52a31af6fff --- /dev/null +++ b/net/ethtool/linkinfo.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct linkinfo_req_info { + struct ethnl_req_info base; +}; + +struct linkinfo_reply_data { + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; +}; + +#define LINKINFO_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkinfo_reply_data, base) + +static const struct nla_policy +linkinfo_get_policy[ETHTOOL_A_LINKINFO_MAX + 1] = { + [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT }, +}; + +static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + data->lsettings = &data->ksettings.base; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = __ethtool_get_link_ksettings(dev, &data->ksettings); + if (ret < 0 && info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + ethnl_ops_complete(dev); + + return ret; +} + +static int linkinfo_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */ + + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */ + + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */ + + 0; +} + +static int linkinfo_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); + + if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, + data->lsettings->phy_address) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, + data->lsettings->eth_tp_mdix) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, + data->lsettings->eth_tp_mdix_ctrl) || + nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, + data->lsettings->transceiver)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkinfo_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKINFO_GET, + .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, + .max_attr = ETHTOOL_A_LINKINFO_MAX, + .req_info_size = sizeof(struct linkinfo_req_info), + .reply_data_size = sizeof(struct linkinfo_reply_data), + .request_policy = linkinfo_get_policy, + + .prepare_data = linkinfo_prepare_data, + .reply_size = linkinfo_reply_size, + .fill_reply = linkinfo_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 7dc082bde670..ce86d97c922e 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -195,6 +195,7 @@ struct ethnl_dump_ctx { static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, + [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -526,6 +527,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_LINKINFO_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 44e9f63aefb7..9fc8f94d8dce 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -329,5 +329,6 @@ struct ethnl_request_ops { /* request handlers */ extern const struct ethnl_request_ops ethnl_strset_request_ops; +extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From a53f3d41e4d3df46aba254ba51e7fbe45470fece Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:53 +0100 Subject: ethtool: set link settings with LINKINFO_SET request Implement LINKINFO_SET netlink request to set link settings queried by LINKINFO_GET message. Only physical port, phy MDIO address and MDI(-X) control can be set, attempt to modify MDI(-X) status and transceiver is rejected. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 24 +++++++++- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/linkinfo.c | 71 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 5 ++ net/ethtool/netlink.h | 2 + 5 files changed, 101 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 60684786a92c..39c4aba324c3 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -181,6 +181,7 @@ Userspace to kernel: ===================================== ================================ ``ETHTOOL_MSG_STRSET_GET`` get string set ``ETHTOOL_MSG_LINKINFO_GET`` get link settings + ``ETHTOOL_MSG_LINKINFO_SET`` set link settings ===================================== ================================ Kernel to userspace: @@ -311,6 +312,25 @@ corresponding ioctl structures. devices supporting the request). +LINKINFO_SET +============ + +``LINKINFO_SET`` request allows setting some of the attributes reported by +``LINKINFO_GET``. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKINFO_HEADER`` nested request header + ``ETHTOOL_A_LINKINFO_PORT`` u8 physical port + ``ETHTOOL_A_LINKINFO_PHYADDR`` u8 phy MDIO address + ``ETHTOOL_A_LINKINFO_TP_MDIX_CTRL`` u8 MDI(-X) control + ==================================== ====== ========================== + +MDI(-X) status and transceiver cannot be set, request with the corresponding +attributes is rejected. + + Request translation =================== @@ -322,7 +342,7 @@ have their netlink replacement yet. ioctl command netlink command =================================== ===================================== ``ETHTOOL_GSET`` ``ETHTOOL_MSG_LINKINFO_GET`` - ``ETHTOOL_SSET`` n/a + ``ETHTOOL_SSET`` ``ETHTOOL_MSG_LINKINFO_SET`` ``ETHTOOL_GDRVINFO`` n/a ``ETHTOOL_GREGS`` n/a ``ETHTOOL_GWOL`` n/a @@ -396,7 +416,7 @@ have their netlink replacement yet. ``ETHTOOL_GPHYSTATS`` n/a ``ETHTOOL_PERQUEUE`` n/a ``ETHTOOL_GLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_GET`` - ``ETHTOOL_SLINKSETTINGS`` n/a + ``ETHTOOL_SLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_SET`` ``ETHTOOL_PHY_GTUNABLE`` n/a ``ETHTOOL_PHY_STUNABLE`` n/a ``ETHTOOL_GFECPARAM`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 1966532993e5..5b7806a5bef8 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -16,6 +16,7 @@ enum { ETHTOOL_MSG_USER_NONE, ETHTOOL_MSG_STRSET_GET, ETHTOOL_MSG_LINKINFO_GET, + ETHTOOL_MSG_LINKINFO_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index f52a31af6fff..8a5f68f92425 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -92,3 +92,74 @@ const struct ethnl_request_ops ethnl_linkinfo_request_ops = { .reply_size = linkinfo_reply_size, .fill_reply = linkinfo_fill_reply, }; + +/* LINKINFO_SET */ + +static const struct nla_policy +linkinfo_set_policy[ETHTOOL_A_LINKINFO_MAX + 1] = { + [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT }, +}; + +int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_LINKINFO_MAX + 1]; + struct ethtool_link_ksettings ksettings = {}; + struct ethtool_link_settings *lsettings; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_LINKINFO_MAX, linkinfo_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKINFO_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_link_ksettings || + !dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = __ethtool_get_link_ksettings(dev, &ksettings); + if (ret < 0) { + if (info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out_ops; + } + lsettings = &ksettings.base; + + ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod); + ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR], + &mod); + ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, + tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); + ret = 0; + if (!mod) + goto out_ops; + + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) + GENL_SET_ERR_MSG(info, "link settings update failed"); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index ce86d97c922e..7867425956f6 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -534,6 +534,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_LINKINFO_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_linkinfo, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9fc8f94d8dce..bbe5fe60a023 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -331,4 +331,6 @@ struct ethnl_request_ops { extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; +int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); + #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 5cf2a548bcbd3da3e79adb0b64ef96dbb9988d78 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:55:58 +0100 Subject: ethtool: add default notification handler The ethtool netlink notifications have the same format as related GET replies so that if generic GET handling framework is used to process GET requests, its callbacks and instance of struct get_request_ops can be also used to compose corresponding notification message. Provide function ethnl_std_notify() to be used as notification handler in ethnl_notify_handlers table. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/ethtool/netlink.h | 4 ++- 2 files changed, 92 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 7867425956f6..057b67f8ba8c 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -7,6 +7,7 @@ static struct genl_family ethtool_genl_family; static bool ethnl_ok __read_mostly; +static u32 ethnl_bcast_seq; static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = { [ETHTOOL_A_HEADER_UNSPEC] = { .type = NLA_REJECT }, @@ -171,6 +172,18 @@ err: return NULL; } +static void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) +{ + return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, + cmd); +} + +static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) +{ + return genlmsg_multicast_netns(ðtool_genl_family, dev_net(dev), skb, + 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); +} + /* GET request helpers */ /** @@ -494,6 +507,82 @@ static int ethnl_default_done(struct netlink_callback *cb) return 0; } +static const struct ethnl_request_ops * +ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { +}; + +/* default notification handler */ +static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, + const void *data) +{ + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct sk_buff *skb; + void *reply_payload; + int reply_len; + int ret; + + if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || + !ethnl_default_notify_ops[cmd], + "unexpected notification type %u\n", cmd)) + return; + ops = ethnl_default_notify_ops[cmd]; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return; + } + + req_info->dev = dev; + req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; + + ethnl_init_reply_data(reply_data, ops, dev); + ret = ops->prepare_data(req_info, reply_data, NULL); + if (ret < 0) + goto err_cleanup; + reply_len = ops->reply_size(req_info, reply_data); + if (ret < 0) + goto err_cleanup; + ret = -ENOMEM; + skb = genlmsg_new(reply_len, GFP_KERNEL); + if (!skb) + goto err_cleanup; + reply_payload = ethnl_bcastmsg_put(skb, cmd); + if (!reply_payload) + goto err_skb; + ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr); + if (ret < 0) + goto err_msg; + ret = ops->fill_reply(skb, req_info, reply_data); + if (ret < 0) + goto err_msg; + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + + genlmsg_end(skb, reply_payload); + kfree(reply_data); + kfree(req_info); + ethnl_multicast(skb, dev); + return; + +err_msg: + WARN_ONCE(ret == -EMSGSIZE, + "calculated message payload length (%d) not sufficient\n", + reply_len); +err_skb: + nlmsg_free(skb); +err_cleanup: + if (ops->cleanup_data) + ops->cleanup_data(reply_data); + kfree(reply_data); + kfree(req_info); + return; +} + /* notifications */ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index bbe5fe60a023..5d56d7779a06 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -300,7 +300,9 @@ static inline void ethnl_ops_complete(struct net_device *dev) * unified infrastructure. When used, a pointer to an instance of this * structure is to be added to ðnl_default_requests array and generic * handlers ethnl_default_doit(), ethnl_default_dumpit(), - * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops. + * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; + * ethnl_default_notify() can be used in @ethnl_notify_handlers to send + * notifications of the corresponding type. */ struct ethnl_request_ops { u8 request_cmd; -- cgit v1.2.3 From 73286734c1b0d009fd317c2a74e173cb22233dad Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:56:03 +0100 Subject: ethtool: add LINKINFO_NTF notification Send ETHTOOL_MSG_LINKINFO_NTF notification message whenever device link settings are modified using ETHTOOL_MSG_LINKINFO_SET netlink message or ETHTOOL_SLINKSETTINGS or ETHTOOL_SSET ioctl commands. The notification message has the same format as reply to LINKINFO_GET request. ETHTOOL_MSG_LINKINFO_SET netlink request only triggers the notification if there is a change but the ioctl command handlers do not check if there is an actual change and trigger the notification whenever the commands are executed. As all work is done by ethnl_default_notify() handler and callback functions introduced to handle LINKINFO_GET requests, all that remains is adding entries for ETHTOOL_MSG_LINKINFO_NTF into ethnl_notify_handlers and ethnl_default_notify_ops lookup tables and calls to ethtool_notify() where needed. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 1 + include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/ioctl.c | 12 ++++++++++-- net/ethtool/linkinfo.c | 2 ++ net/ethtool/netlink.c | 2 ++ 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 39c4aba324c3..34254482d295 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -189,6 +189,7 @@ Kernel to userspace: ===================================== ================================ ``ETHTOOL_MSG_STRSET_GET_REPLY`` string set contents ``ETHTOOL_MSG_LINKINFO_GET_REPLY`` link settings + ``ETHTOOL_MSG_LINKINFO_NTF`` link settings notification ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 5b7806a5bef8..d530fa30de36 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -28,6 +28,7 @@ enum { ETHTOOL_MSG_KERNEL_NONE, ETHTOOL_MSG_STRSET_GET_REPLY, ETHTOOL_MSG_LINKINFO_GET_REPLY, + ETHTOOL_MSG_LINKINFO_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 8a0a13b478e0..11a467294a33 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "common.h" @@ -571,7 +572,10 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + if (err >= 0) + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + return err; } /* Query device for its ethtool_cmd settings. @@ -620,6 +624,7 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; + int ret; ASSERT_RTNL(); @@ -632,7 +637,10 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return -EINVAL; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; - return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); + if (ret >= 0) + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + return ret; } static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 8a5f68f92425..5d16cb4e8693 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -155,6 +155,8 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) GENL_SET_ERR_MSG(info, "link settings update failed"); + else + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); out_ops: ethnl_ops_complete(dev); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 057b67f8ba8c..942da4ebdfe9 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -509,6 +509,7 @@ static int ethnl_default_done(struct netlink_callback *cb) static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { + [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, }; /* default notification handler */ @@ -589,6 +590,7 @@ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, const void *data); static const ethnl_notify_handler_t ethnl_notify_handlers[] = { + [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) -- cgit v1.2.3 From f625aa9be8c10f2e4dc677837e240730a25feda7 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:56:08 +0100 Subject: ethtool: provide link mode information with LINKMODES_GET request Implement LINKMODES_GET netlink request to get link modes related information provided by ETHTOOL_GLINKSETTINGS and ETHTOOL_GSET ioctl commands. This request provides supported, advertised and peer advertised link modes, autonegotiation flag, speed and duplex. LINKMODES_GET request can be used with NLM_F_DUMP (without device identification) to request the information for all devices in current network namespace providing the data. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 36 +++++++ include/linux/ethtool_netlink.h | 3 + include/uapi/linux/ethtool_netlink.h | 18 ++++ net/ethtool/Makefile | 2 +- net/ethtool/linkmodes.c | 140 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 8 ++ net/ethtool/netlink.h | 1 + 7 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/linkmodes.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 34254482d295..04c55be0264c 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -182,6 +182,7 @@ Userspace to kernel: ``ETHTOOL_MSG_STRSET_GET`` get string set ``ETHTOOL_MSG_LINKINFO_GET`` get link settings ``ETHTOOL_MSG_LINKINFO_SET`` set link settings + ``ETHTOOL_MSG_LINKMODES_GET`` get link modes info ===================================== ================================ Kernel to userspace: @@ -190,6 +191,7 @@ Kernel to userspace: ``ETHTOOL_MSG_STRSET_GET_REPLY`` string set contents ``ETHTOOL_MSG_LINKINFO_GET_REPLY`` link settings ``ETHTOOL_MSG_LINKINFO_NTF`` link settings notification + ``ETHTOOL_MSG_LINKMODES_GET_REPLY`` link modes info ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device @@ -332,6 +334,38 @@ MDI(-X) status and transceiver cannot be set, request with the corresponding attributes is rejected. +LINKMODES_GET +============= + +Requests link modes (supported, advertised and peer advertised) and related +information (autonegotiation status, link speed and duplex) as provided by +``ETHTOOL_GLINKSETTINGS``. The request does not use any attributes. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested reply header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ==================================== ====== ========================== + +For ``ETHTOOL_A_LINKMODES_OURS``, value represents advertised modes and mask +represents supported modes. ``ETHTOOL_A_LINKMODES_PEER`` in the reply is a bit +list. + +``LINKMODES_GET`` allows dump requests (kernel returns reply messages for all +devices supporting the request). + + Request translation =================== @@ -343,6 +377,7 @@ have their netlink replacement yet. ioctl command netlink command =================================== ===================================== ``ETHTOOL_GSET`` ``ETHTOOL_MSG_LINKINFO_GET`` + ``ETHTOOL_MSG_LINKMODES_GET`` ``ETHTOOL_SSET`` ``ETHTOOL_MSG_LINKINFO_SET`` ``ETHTOOL_GDRVINFO`` n/a ``ETHTOOL_GREGS`` n/a @@ -417,6 +452,7 @@ have their netlink replacement yet. ``ETHTOOL_GPHYSTATS`` n/a ``ETHTOOL_PERQUEUE`` n/a ``ETHTOOL_GLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_GET`` + ``ETHTOOL_MSG_LINKMODES_GET`` ``ETHTOOL_SLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_SET`` ``ETHTOOL_PHY_GTUNABLE`` n/a ``ETHTOOL_PHY_STUNABLE`` n/a diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index c98f6852c8eb..d01b77887f82 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -7,6 +7,9 @@ #include #include +#define __ETHTOOL_LINK_MODE_MASK_NWORDS \ + DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) + enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index d530fa30de36..dc1cae052eee 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -17,6 +17,7 @@ enum { ETHTOOL_MSG_STRSET_GET, ETHTOOL_MSG_LINKINFO_GET, ETHTOOL_MSG_LINKINFO_SET, + ETHTOOL_MSG_LINKMODES_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -29,6 +30,7 @@ enum { ETHTOOL_MSG_STRSET_GET_REPLY, ETHTOOL_MSG_LINKINFO_GET_REPLY, ETHTOOL_MSG_LINKINFO_NTF, + ETHTOOL_MSG_LINKMODES_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -161,6 +163,22 @@ enum { ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 }; +/* LINKMODES */ + +enum { + ETHTOOL_A_LINKMODES_UNSPEC, + ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ + ETHTOOL_A_LINKMODES_OURS, /* bitset */ + ETHTOOL_A_LINKMODES_PEER, /* bitset */ + ETHTOOL_A_LINKMODES_SPEED, /* u32 */ + ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKMODES_CNT, + ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 765736ec52c0..8023da6672ce 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -4,4 +4,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o -ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o +ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c new file mode 100644 index 000000000000..81856fa1e632 --- /dev/null +++ b/net/ethtool/linkmodes.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct linkmodes_req_info { + struct ethnl_req_info base; +}; + +struct linkmodes_reply_data { + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; + bool peer_empty; +}; + +#define LINKMODES_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkmodes_reply_data, base) + +static const struct nla_policy +linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { + [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, +}; + +static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + data->lsettings = &data->ksettings.base; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = __ethtool_get_link_ksettings(dev, &data->ksettings); + if (ret < 0 && info) { + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out; + } + + data->peer_empty = + bitmap_empty(data->ksettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + +out: + ethnl_ops_complete(dev); + return ret; +} + +static int linkmodes_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + const struct ethtool_link_ksettings *ksettings = &data->ksettings; + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int len, ret; + + len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + + 0; + ret = ethnl_bitset_size(ksettings->link_modes.advertising, + ksettings->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + if (!data->peer_empty) { + ret = ethnl_bitset_size(ksettings->link_modes.lp_advertising, + NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + } + + return len; +} + +static int linkmodes_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); + const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int ret; + + if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg)) + return -EMSGSIZE; + + ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS, + ksettings->link_modes.advertising, + ksettings->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, + compact); + if (ret < 0) + return -EMSGSIZE; + if (!data->peer_empty) { + ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER, + ksettings->link_modes.lp_advertising, + NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return -EMSGSIZE; + } + + if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, lsettings->speed) || + nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkmodes_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKMODES_GET, + .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKMODES_HEADER, + .max_attr = ETHTOOL_A_LINKMODES_MAX, + .req_info_size = sizeof(struct linkmodes_req_info), + .reply_data_size = sizeof(struct linkmodes_reply_data), + .request_policy = linkmodes_get_policy, + + .prepare_data = linkmodes_prepare_data, + .reply_size = linkmodes_reply_size, + .fill_reply = linkmodes_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 942da4ebdfe9..703ff3a227a4 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -209,6 +209,7 @@ static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -630,6 +631,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_linkinfo, }, + { + .cmd = ETHTOOL_MSG_LINKMODES_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 5d56d7779a06..256d38972d1e 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -332,6 +332,7 @@ struct ethnl_request_ops { extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; +extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From bfbcfe2032e70bd8598d680d39ac177d507e39ac Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:56:13 +0100 Subject: ethtool: set link modes related data with LINKMODES_SET request Implement LINKMODES_SET netlink request to set advertised linkmodes and related attributes as ETHTOOL_SLINKSETTINGS and ETHTOOL_SSET commands do. The request allows setting autonegotiation flag, speed, duplex and advertised link modes. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 27 +++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/linkmodes.c | 235 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 5 + net/ethtool/netlink.h | 1 + 5 files changed, 269 insertions(+) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 04c55be0264c..625c80183563 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -183,6 +183,7 @@ Userspace to kernel: ``ETHTOOL_MSG_LINKINFO_GET`` get link settings ``ETHTOOL_MSG_LINKINFO_SET`` set link settings ``ETHTOOL_MSG_LINKMODES_GET`` get link modes info + ``ETHTOOL_MSG_LINKMODES_SET`` set link modes info ===================================== ================================ Kernel to userspace: @@ -366,6 +367,30 @@ list. devices supporting the request). +LINKMODES_SET +============= + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKMODES_HEADER`` nested request header + ``ETHTOOL_A_LINKMODES_AUTONEG`` u8 autonegotiation status + ``ETHTOOL_A_LINKMODES_OURS`` bitset advertised link modes + ``ETHTOOL_A_LINKMODES_PEER`` bitset partner link modes + ``ETHTOOL_A_LINKMODES_SPEED`` u32 link speed (Mb/s) + ``ETHTOOL_A_LINKMODES_DUPLEX`` u8 duplex mode + ==================================== ====== ========================== + +``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. If +autonegotiation is on (either set now or kept from before), advertised modes +are not changed (no ``ETHTOOL_A_LINKMODES_OURS`` attribute) and at least one +of speed and duplex is specified, kernel adjusts advertised modes to all +supported modes matching speed, duplex or both (whatever is specified). This +autoselection is done on ethtool side with ioctl interface, netlink interface +is supposed to allow requesting changes without knowing what exactly kernel +supports. + + Request translation =================== @@ -379,6 +404,7 @@ have their netlink replacement yet. ``ETHTOOL_GSET`` ``ETHTOOL_MSG_LINKINFO_GET`` ``ETHTOOL_MSG_LINKMODES_GET`` ``ETHTOOL_SSET`` ``ETHTOOL_MSG_LINKINFO_SET`` + ``ETHTOOL_MSG_LINKMODES_SET`` ``ETHTOOL_GDRVINFO`` n/a ``ETHTOOL_GREGS`` n/a ``ETHTOOL_GWOL`` n/a @@ -454,6 +480,7 @@ have their netlink replacement yet. ``ETHTOOL_GLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_GET`` ``ETHTOOL_MSG_LINKMODES_GET`` ``ETHTOOL_SLINKSETTINGS`` ``ETHTOOL_MSG_LINKINFO_SET`` + ``ETHTOOL_MSG_LINKMODES_SET`` ``ETHTOOL_PHY_GTUNABLE`` n/a ``ETHTOOL_PHY_STUNABLE`` n/a ``ETHTOOL_GFECPARAM`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index dc1cae052eee..cddf978b98df 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -18,6 +18,7 @@ enum { ETHTOOL_MSG_LINKINFO_GET, ETHTOOL_MSG_LINKINFO_SET, ETHTOOL_MSG_LINKMODES_GET, + ETHTOOL_MSG_LINKMODES_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 81856fa1e632..790b60771d0e 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -138,3 +138,238 @@ const struct ethnl_request_ops ethnl_linkmodes_request_ops = { .reply_size = linkmodes_reply_size, .fill_reply = linkmodes_fill_reply, }; + +/* LINKMODES_SET */ + +struct link_mode_info { + int speed; + u8 duplex; +}; + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .duplex = __DUPLEX_ ## _duplex \ + } +#define __DUPLEX_Half DUPLEX_HALF +#define __DUPLEX_Full DUPLEX_FULL +#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ + [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \ + .speed = SPEED_UNKNOWN, \ + .duplex = DUPLEX_UNKNOWN, \ + } + +static const struct link_mode_info link_mode_params[] = { + __DEFINE_LINK_MODE_PARAMS(10, T, Half), + __DEFINE_LINK_MODE_PARAMS(10, T, Full), + __DEFINE_LINK_MODE_PARAMS(100, T, Half), + __DEFINE_LINK_MODE_PARAMS(100, T, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T, Half), + __DEFINE_LINK_MODE_PARAMS(1000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), + __DEFINE_SPECIAL_MODE_PARAMS(TP), + __DEFINE_SPECIAL_MODE_PARAMS(AUI), + __DEFINE_SPECIAL_MODE_PARAMS(MII), + __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), + __DEFINE_SPECIAL_MODE_PARAMS(BNC), + __DEFINE_LINK_MODE_PARAMS(10000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Pause), + __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), + __DEFINE_LINK_MODE_PARAMS(2500, X, Full), + __DEFINE_SPECIAL_MODE_PARAMS(Backplane), + __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), + __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), + [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + }, + __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), + __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), + __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(1000, X, Full), + __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), + __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), + __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), + __DEFINE_LINK_MODE_PARAMS(2500, T, Full), + __DEFINE_LINK_MODE_PARAMS(5000, T, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), + __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, T1, Full), + __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), +}; + +static const struct nla_policy +linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { + [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, + [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, +}; + +/* Set advertised link modes to all supported modes matching requested speed + * and duplex values. Called when autonegotiation is on, speed or duplex is + * requested but no link mode change. This is done in userspace with ioctl() + * interface, move it into kernel for netlink. + * Returns true if advertised modes bitmap was modified. + */ +static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, + bool req_speed, bool req_duplex) +{ + unsigned long *advertising = ksettings->link_modes.advertising; + unsigned long *supported = ksettings->link_modes.supported; + DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS); + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(link_mode_params) != + __ETHTOOL_LINK_MODE_MASK_NBITS); + + bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); + + for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) { + const struct link_mode_info *info = &link_mode_params[i]; + + if (info->speed == SPEED_UNKNOWN) + continue; + if (test_bit(i, supported) && + (!req_speed || info->speed == ksettings->base.speed) && + (!req_duplex || info->duplex == ksettings->base.duplex)) + set_bit(i, advertising); + else + clear_bit(i, advertising); + } + + return !bitmap_equal(old_adv, advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, + struct ethtool_link_ksettings *ksettings, + bool *mod) +{ + struct ethtool_link_settings *lsettings = &ksettings->base; + bool req_speed, req_duplex; + int ret; + + *mod = false; + req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; + req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; + + ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG], + mod); + ret = ethnl_update_bitset(ksettings->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names, + info->extack, mod); + if (ret < 0) + return ret; + ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED], + mod); + ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], + mod); + + if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && + (req_speed || req_duplex) && + ethnl_auto_linkmodes(ksettings, req_speed, req_duplex)) + *mod = true; + + return 0; +} + +int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_LINKMODES_MAX + 1]; + struct ethtool_link_ksettings ksettings = {}; + struct ethtool_link_settings *lsettings; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_LINKMODES_MAX, linkmodes_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKMODES_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_link_ksettings || + !dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = __ethtool_get_link_ksettings(dev, &ksettings); + if (ret < 0) { + if (info) + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + goto out_ops; + } + lsettings = &ksettings.base; + + ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod); + if (ret < 0) + goto out_ops; + + if (mod) { + ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); + if (ret < 0) + GENL_SET_ERR_MSG(info, "link settings update failed"); + } + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 703ff3a227a4..5f28f3cb022d 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -638,6 +638,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_LINKMODES_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_linkmodes, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 256d38972d1e..1269cca8a002 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -335,5 +335,6 @@ extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 1b1b1847c8505df2e1dd8804838526bed22a8bd4 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:56:18 +0100 Subject: ethtool: add LINKMODES_NTF notification Send ETHTOOL_MSG_LINKMODES_NTF notification message whenever device link settings or advertised modes are modified using ETHTOOL_MSG_LINKMODES_SET netlink message or ETHTOOL_SLINKSETTINGS or ETHTOOL_SSET ioctl commands. The notification message has the same format as reply to LINKMODES_GET request. ETHTOOL_MSG_LINKMODES_SET netlink request only triggers the notification if there is a change but the ioctl command handlers do not check if there is an actual change and trigger the notification whenever the commands are executed. As all work is done by ethnl_default_notify() handler and callback functions introduced to handle LINKMODES_GET requests, all that remains is adding entries for ETHTOOL_MSG_LINKMODES_NTF into ethnl_notify_handlers and ethnl_default_notify_ops lookup tables and calls to ethtool_notify() where needed. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 1 + include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/ioctl.c | 8 ++++++-- net/ethtool/linkmodes.c | 2 ++ net/ethtool/netlink.c | 2 ++ 5 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 625c80183563..9d96d51e9360 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -193,6 +193,7 @@ Kernel to userspace: ``ETHTOOL_MSG_LINKINFO_GET_REPLY`` link settings ``ETHTOOL_MSG_LINKINFO_NTF`` link settings notification ``ETHTOOL_MSG_LINKMODES_GET_REPLY`` link modes info + ``ETHTOOL_MSG_LINKMODES_NTF`` link modes notification ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index cddf978b98df..35948df6d6e3 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -32,6 +32,7 @@ enum { ETHTOOL_MSG_LINKINFO_GET_REPLY, ETHTOOL_MSG_LINKINFO_NTF, ETHTOOL_MSG_LINKMODES_GET_REPLY, + ETHTOOL_MSG_LINKMODES_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 11a467294a33..36e2ef2d900d 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -573,8 +573,10 @@ static int ethtool_set_link_ksettings(struct net_device *dev, return -EINVAL; err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); - if (err >= 0) + if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } return err; } @@ -638,8 +640,10 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); - if (ret >= 0) + if (ret >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + } return ret; } diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 790b60771d0e..0b99f494ad3b 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -364,6 +364,8 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) GENL_SET_ERR_MSG(info, "link settings update failed"); + else + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); } out_ops: diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 5f28f3cb022d..1b5e1bd26504 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -511,6 +511,7 @@ static int ethnl_default_done(struct netlink_callback *cb) static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, + [ETHTOOL_MSG_LINKMODES_NTF] = ðnl_linkmodes_request_ops, }; /* default notification handler */ @@ -592,6 +593,7 @@ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) -- cgit v1.2.3 From 3d2b847fb99cf2b28aa046e486636e555bc6ed1c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 27 Dec 2019 15:56:23 +0100 Subject: ethtool: provide link state with LINKSTATE_GET request Implement LINKSTATE_GET netlink request to get link state information. At the moment, only link up flag as provided by ETHTOOL_GLINK ioctl command is returned. LINKSTATE_GET request can be used with NLM_F_DUMP (without device identification) to request the information for all devices in current network namespace providing the data. Signed-off-by: Michal Kubecek Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 33 ++++++++++++- include/uapi/linux/ethtool_netlink.h | 14 ++++++ net/ethtool/Makefile | 3 +- net/ethtool/common.c | 8 +++ net/ethtool/common.h | 3 ++ net/ethtool/ioctl.c | 8 +-- net/ethtool/linkstate.c | 74 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 8 +++ net/ethtool/netlink.h | 1 + 9 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 net/ethtool/linkstate.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 9d96d51e9360..c60afba69e3c 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -184,6 +184,7 @@ Userspace to kernel: ``ETHTOOL_MSG_LINKINFO_SET`` set link settings ``ETHTOOL_MSG_LINKMODES_GET`` get link modes info ``ETHTOOL_MSG_LINKMODES_SET`` set link modes info + ``ETHTOOL_MSG_LINKSTATE_GET`` get link state ===================================== ================================ Kernel to userspace: @@ -194,6 +195,7 @@ Kernel to userspace: ``ETHTOOL_MSG_LINKINFO_NTF`` link settings notification ``ETHTOOL_MSG_LINKMODES_GET_REPLY`` link modes info ``ETHTOOL_MSG_LINKMODES_NTF`` link modes notification + ``ETHTOOL_MSG_LINKSTATE_GET_REPLY`` link state info ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device @@ -392,6 +394,35 @@ is supposed to allow requesting changes without knowing what exactly kernel supports. +LINKSTATE_GET +============= + +Requests link state information. At the moment, only link up/down flag (as +provided by ``ETHTOOL_GLINK`` ioctl command) is provided but some future +extensions are planned (e.g. link down reason). This request does not have any +attributes. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKSTATE_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_LINKSTATE_HEADER`` nested reply header + ``ETHTOOL_A_LINKSTATE_LINK`` bool link state (up/down) + ==================================== ====== ========================== + +For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns +carrier flag provided by ``netif_carrier_ok()`` but there are drivers which +define their own handler. + +``LINKSTATE_GET`` allows dump requests (kernel returns reply messages for all +devices supporting the request). + + Request translation =================== @@ -413,7 +444,7 @@ have their netlink replacement yet. ``ETHTOOL_GMSGLVL`` n/a ``ETHTOOL_SMSGLVL`` n/a ``ETHTOOL_NWAY_RST`` n/a - ``ETHTOOL_GLINK`` n/a + ``ETHTOOL_GLINK`` ``ETHTOOL_MSG_LINKSTATE_GET`` ``ETHTOOL_GEEPROM`` n/a ``ETHTOOL_SEEPROM`` n/a ``ETHTOOL_GCOALESCE`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 35948df6d6e3..02f82f42a889 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -19,6 +19,7 @@ enum { ETHTOOL_MSG_LINKINFO_SET, ETHTOOL_MSG_LINKMODES_GET, ETHTOOL_MSG_LINKMODES_SET, + ETHTOOL_MSG_LINKSTATE_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -33,6 +34,7 @@ enum { ETHTOOL_MSG_LINKINFO_NTF, ETHTOOL_MSG_LINKMODES_GET_REPLY, ETHTOOL_MSG_LINKMODES_NTF, + ETHTOOL_MSG_LINKSTATE_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -181,6 +183,18 @@ enum { ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 }; +/* LINKSTATE */ + +enum { + ETHTOOL_A_LINKSTATE_UNSPEC, + ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKSTATE_LINK, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKSTATE_CNT, + ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 8023da6672ce..9a1332fb0cc6 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -4,4 +4,5 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o -ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o +ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ + linkstate.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 1d4a0aeff2cb..e621b1694d2f 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -217,3 +217,11 @@ convert_legacy_settings_to_link_ksettings( = legacy_settings->eth_tp_mdix_ctrl; return retval; } + +int __ethtool_get_link(struct net_device *dev) +{ + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + return netif_running(dev) && dev->ethtool_ops->get_link(dev); +} diff --git a/net/ethtool/common.h b/net/ethtool/common.h index c8a237402729..5c5f7dc90cd4 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -3,6 +3,7 @@ #ifndef _ETHTOOL_COMMON_H #define _ETHTOOL_COMMON_H +#include #include /* compose link mode index from speed, type and duplex */ @@ -19,6 +20,8 @@ extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; +int __ethtool_get_link(struct net_device *dev); + bool convert_legacy_settings_to_link_ksettings( struct ethtool_link_ksettings *link_ksettings, const struct ethtool_cmd *legacy_settings); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 36e2ef2d900d..182bffbffa78 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1365,12 +1365,12 @@ static int ethtool_nway_reset(struct net_device *dev) static int ethtool_get_link(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + int link = __ethtool_get_link(dev); - if (!dev->ethtool_ops->get_link) - return -EOPNOTSUPP; - - edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + if (link < 0) + return link; + edata.data = link; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c new file mode 100644 index 000000000000..2740cde0a182 --- /dev/null +++ b/net/ethtool/linkstate.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct linkstate_req_info { + struct ethnl_req_info base; +}; + +struct linkstate_reply_data { + struct ethnl_reply_data base; + int link; +}; + +#define LINKSTATE_REPDATA(__reply_base) \ + container_of(__reply_base, struct linkstate_reply_data, base) + +static const struct nla_policy +linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { + [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, +}; + +static int linkstate_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->link = __ethtool_get_link(dev); + ethnl_ops_complete(dev); + + return 0; +} + +static int linkstate_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + + 0; +} + +static int linkstate_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + + if (data->link >= 0 && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_linkstate_request_ops = { + .request_cmd = ETHTOOL_MSG_LINKSTATE_GET, + .reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY, + .hdr_attr = ETHTOOL_A_LINKSTATE_HEADER, + .max_attr = ETHTOOL_A_LINKSTATE_MAX, + .req_info_size = sizeof(struct linkstate_req_info), + .reply_data_size = sizeof(struct linkstate_reply_data), + .request_policy = linkstate_get_policy, + + .prepare_data = linkstate_prepare_data, + .reply_size = linkstate_reply_size, + .fill_reply = linkstate_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 1b5e1bd26504..4ca96c7b86b3 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -210,6 +210,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, + [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -645,6 +646,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_linkmodes, }, + { + .cmd = ETHTOOL_MSG_LINKSTATE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 1269cca8a002..da9d6521a4eb 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -333,6 +333,7 @@ struct ethnl_request_ops { extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; +extern const struct ethnl_request_ops ethnl_linkstate_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From f685e609a301f171b62ee52a69acc3a74c2c04aa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 28 Dec 2019 15:30:46 +0200 Subject: net: dsa: Deny PTP on master if switch supports it It is possible to kill PTP on a DSA switch completely and absolutely, until a reboot, with a simple command: tcpdump -i eth2 -j adapter_unsynced where eth2 is the switch's DSA master. Why? Well, in short, the PTP API in place today is a bit rudimentary and relies on applications to retrieve the TX timestamps by polling the error queue and looking at the cmsg structure. But there is no timestamp identification of any sorts (except whether it's HW or SW), you don't know how many more timestamps are there to come, which one is this one, from whom it is, etc. In other words, the SO_TIMESTAMPING API is fundamentally limited in that you can get a single HW timestamp from the stack. And the "-j adapter_unsynced" flag of tcpdump enables hardware timestamping. So let's imagine what happens when the DSA master decides it wants to deliver TX timestamps to the skb's socket too: - The timestamp that the user space sees is taken by the DSA master. Whereas the RX timestamp will eventually be overwritten by the DSA switch. So the RX and TX timestamps will be in different time bases (aka garbage). - The user space applications have no way to deal with the second (real) TX timestamp finally delivered by the DSA switch, or even to know to wait for it. Take ptp4l from the linuxptp project, for example. This is its behavior after running tcpdump, before the patch: ptp4l[172]: [6469.594] Unexpected data on socket err queue: ptp4l[172]: [6469.693] rms 8 max 16 freq -21257 +/- 11 delay 748 +/- 0 ptp4l[172]: [6469.711] Unexpected data on socket err queue: ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 05 00 fd ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: [6469.721] Unexpected data on socket err queue: ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02 ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 01 c6 b1 00 fd ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: [6469.838] Unexpected data on socket err queue: ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02 ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 06 00 fd ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: [6469.848] Unexpected data on socket err queue: ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 13 02 ptp4l[172]: 0010 00 36 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 04 1a 45 05 7f ptp4l[172]: 0030 00 00 5e 05 41 32 27 c2 1a 68 00 04 9f ff fe 05 ptp4l[172]: 0040 de 06 00 01 ptp4l[172]: [6469.855] Unexpected data on socket err queue: ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02 ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 01 c6 b2 00 fd ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: [6469.974] Unexpected data on socket err queue: ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02 ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 07 00 fd ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00 The ptp4l program itself is heavily patched to show this (more details here [0]). Otherwise, by default it just hangs. On the other hand, with the DSA patch to disallow HW timestamping applied: tcpdump -i eth2 -j adapter_unsynced tcpdump: SIOCSHWTSTAMP failed: Device or resource busy So it is a fact of life that PTP timestamping on the DSA master is incompatible with timestamping on the switch MAC, at least with the current API. And if the switch supports PTP, taking the timestamps from the switch MAC is highly preferable anyway, due to the fact that those don't contain the queuing latencies of the switch. So just disallow PTP on the DSA master if there is any PTP-capable switch attached. [0]: https://sourceforge.net/p/linuxptp/mailman/message/36880648/ Fixes: 0336369d3a4d ("net: dsa: forward hardware timestamping ioctls to switch driver") Signed-off-by: Vladimir Oltean Acked-by: Richard Cochran Signed-off-by: David S. Miller --- net/dsa/master.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'net') diff --git a/net/dsa/master.c b/net/dsa/master.c index 3255dfc97f86..bd44bde272f4 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -197,6 +197,35 @@ static int dsa_master_get_phys_port_name(struct net_device *dev, return 0; } +static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch *ds = cpu_dp->ds; + struct dsa_switch_tree *dst; + int err = -EOPNOTSUPP; + struct dsa_port *dp; + + dst = ds->dst; + + switch (cmd) { + case SIOCGHWTSTAMP: + case SIOCSHWTSTAMP: + /* Deny PTP operations on master if there is at least one + * switch in the tree that is PTP capable. + */ + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->ops->port_hwtstamp_get || + dp->ds->ops->port_hwtstamp_set) + return -EBUSY; + break; + } + + if (cpu_dp->orig_ndo_ops && cpu_dp->orig_ndo_ops->ndo_do_ioctl) + err = cpu_dp->orig_ndo_ops->ndo_do_ioctl(dev, ifr, cmd); + + return err; +} + static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -249,6 +278,7 @@ static int dsa_master_ndo_setup(struct net_device *dev) memcpy(ops, cpu_dp->orig_ndo_ops, sizeof(*ops)); ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; + ops->ndo_do_ioctl = dsa_master_ioctl; dev->netdev_ops = ops; -- cgit v1.2.3 From f278b99ca6b2d91a5744588d81bae297179b0d1f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2019 06:06:19 -0800 Subject: tcp_cubic: refactor code to perform a divide only when needed Neal Cardwell suggested to not change ca->delay_min and apply the ack delay cushion only when Hystart ACK train is still under consideration. This should avoid a 64bit divide unless needed. Tested: 40Gbit(mlx4) testbed (with sch_fq as packet scheduler) $ echo -n 'file tcp_cubic.c +p' >/sys/kernel/debug/dynamic_debug/control $ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart" 14815 16280 15293 15563 11574 15145 14789 18548 16972 12520 TcpExtTCPHystartTrainDetect 10 0.0 TcpExtTCPHystartTrainCwnd 1396 0.0 $ dmesg | tail -10 [ 4873.951350] hystart_ack_train (116 > 93) delay_min 24 (+ ack_delay 69) cwnd 80 [ 4875.155379] hystart_ack_train (55 > 50) delay_min 21 (+ ack_delay 29) cwnd 160 [ 4876.333921] hystart_ack_train (69 > 62) delay_min 23 (+ ack_delay 39) cwnd 130 [ 4877.519037] hystart_ack_train (69 > 60) delay_min 22 (+ ack_delay 38) cwnd 130 [ 4878.701559] hystart_ack_train (87 > 63) delay_min 24 (+ ack_delay 39) cwnd 160 [ 4879.844597] hystart_ack_train (93 > 50) delay_min 21 (+ ack_delay 29) cwnd 216 [ 4880.956650] hystart_ack_train (74 > 67) delay_min 20 (+ ack_delay 47) cwnd 108 [ 4882.098500] hystart_ack_train (61 > 57) delay_min 23 (+ ack_delay 34) cwnd 130 [ 4883.262056] hystart_ack_train (72 > 67) delay_min 21 (+ ack_delay 46) cwnd 130 [ 4884.418760] hystart_ack_train (74 > 67) delay_min 29 (+ ack_delay 38) cwnd 152 10Gbit(bnx2x) testbed (with sch_fq as packet scheduler) $ echo -n 'file tcp_cubic.c +p' >/sys/kernel/debug/dynamic_debug/control $ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpk52 -l -4000000; done;nstat|egrep "Hystart" 7050 7065 7100 6900 7202 7263 7189 6869 7463 7034 TcpExtTCPHystartTrainDetect 10 0.0 TcpExtTCPHystartTrainCwnd 3199 0.0 $ dmesg | tail -10 [ 176.920012] hystart_ack_train (161 > 141) delay_min 83 (+ ack_delay 58) cwnd 264 [ 179.144645] hystart_ack_train (164 > 159) delay_min 120 (+ ack_delay 39) cwnd 444 [ 181.354527] hystart_ack_train (214 > 168) delay_min 125 (+ ack_delay 43) cwnd 436 [ 183.539565] hystart_ack_train (170 > 147) delay_min 96 (+ ack_delay 51) cwnd 326 [ 185.727309] hystart_ack_train (177 > 160) delay_min 61 (+ ack_delay 99) cwnd 128 [ 187.947142] hystart_ack_train (184 > 167) delay_min 123 (+ ack_delay 44) cwnd 367 [ 190.166680] hystart_ack_train (230 > 153) delay_min 116 (+ ack_delay 37) cwnd 444 [ 192.327285] hystart_ack_train (210 > 206) delay_min 86 (+ ack_delay 120) cwnd 152 [ 194.511392] hystart_ack_train (173 > 151) delay_min 94 (+ ack_delay 57) cwnd 239 [ 196.736023] hystart_ack_train (149 > 146) delay_min 105 (+ ack_delay 41) cwnd 399 Fixes: 42f3a8aaae66 ("tcp_cubic: tweak Hystart detection for short RTT flows") Signed-off-by: Eric Dumazet Reported-by: Neal Cardwell Link: https://www.spinics.net/lists/netdev/msg621886.html Link: https://www.spinics.net/lists/netdev/msg621797.html Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 51 ++++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index d02bb283c689..8f8eefd3a3ce 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -372,6 +372,26 @@ static void bictcp_state(struct sock *sk, u8 new_state) } } +/* Account for TSO/GRO delays. + * Otherwise short RTT flows could get too small ssthresh, since during + * slow start we begin with small TSO packets and ca->delay_min would + * not account for long aggregation delay when TSO packets get bigger. + * Ideally even with a very small RTT we would like to have at least one + * TSO packet being sent and received by GRO, and another one in qdisc layer. + * We apply another 100% factor because @rate is doubled at this point. + * We cap the cushion to 1ms. + */ +static u32 hystart_ack_delay(struct sock *sk) +{ + unsigned long rate; + + rate = READ_ONCE(sk->sk_pacing_rate); + if (!rate) + return 0; + return min_t(u64, USEC_PER_MSEC, + div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); +} + static void hystart_update(struct sock *sk, u32 delay) { struct tcp_sock *tp = tcp_sk(sk); @@ -385,7 +405,8 @@ static void hystart_update(struct sock *sk, u32 delay) if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; - threshold = ca->delay_min; + threshold = ca->delay_min + hystart_ack_delay(sk); + /* Hystart ack train triggers if we get ack past * ca->delay_min/2. * Pacing might have delayed packets up to RTT/2 @@ -396,6 +417,9 @@ static void hystart_update(struct sock *sk, u32 delay) if ((s32)(now - ca->round_start) > threshold) { ca->found = 1; + pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", + now - ca->round_start, threshold, + ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), @@ -447,30 +471,11 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) delay = 1; /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) { - unsigned long rate = READ_ONCE(sk->sk_pacing_rate); - - /* Account for TSO/GRO delays. - * Otherwise short RTT flows could get too small ssthresh, - * since during slow start we begin with small TSO packets - * and could lower ca->delay_min too much. - * Ideally even with a very small RTT we would like to have - * at least one TSO packet being sent and received by GRO, - * and another one in qdisc layer. - * We apply another 100% factor because @rate is doubled at - * this point. - * We cap the cushion to 1ms. - */ - if (rate) - delay += min_t(u64, USEC_PER_MSEC, - div64_ul((u64)GSO_MAX_SIZE * - 4 * USEC_PER_SEC, rate)); - if (ca->delay_min == 0 || ca->delay_min > delay) - ca->delay_min = delay; - } + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (!ca->found && hystart && tcp_in_slow_start(tp) && + if (!ca->found && tcp_in_slow_start(tp) && hystart && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } -- cgit v1.2.3 From f398efc14a9277b55defff71f59a46fdf13c713f Mon Sep 17 00:00:00 2001 From: Kevin Kou Date: Fri, 27 Dec 2019 13:11:16 +0000 Subject: sctp: add enabled check for path tracepoint loop. sctp_outq_sack is the main function handles SACK, it is called very frequently. As the commit "move trace_sctp_probe_path into sctp_outq_sack" added below code to this function, sctp tracepoint is disabled most of time, but the loop of transport list will be always called even though the tracepoint is disabled, this is unnecessary. + /* SCTP path tracepoint for congestion control debugging. */ + list_for_each_entry(transport, transport_list, transports) { + trace_sctp_probe_path(transport, asoc); + } This patch is to add tracepoint enabled check at outside of the loop of transport list, and avoid traversing the loop when trace is disabled, it is a small optimization. Signed-off-by: Kevin Kou Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 6b0b3bad4daa..577e3bc4ee6f 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1240,8 +1240,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) transport_list = &asoc->peer.transport_addr_list; /* SCTP path tracepoint for congestion control debugging. */ - list_for_each_entry(transport, transport_list, transports) { - trace_sctp_probe_path(transport, asoc); + if (trace_sctp_probe_path_enabled()) { + list_for_each_entry(transport, transport_list, transports) + trace_sctp_probe_path(transport, asoc); } sack_ctsn = ntohl(sack->cum_tsn_ack); -- cgit v1.2.3 From 9e860947d8d7a1504476ac49abfce90a4ce600f3 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Fri, 27 Dec 2019 14:43:49 -0800 Subject: net/ncsi: Fix gma flag setting after response gma_flag was set at the time of GMA command request but it should only be set after getting successful response. Movinng this flag setting in GMA response handler. This flag is used mainly for not repeating GMA command once received MAC address. Signed-off-by: Vijay Khemka Reviewed-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 3 --- net/ncsi/ncsi-rsp.c | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 70fe02697544..e20b81514029 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -764,9 +764,6 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) return -1; } - /* Set the flag for GMA command which should only be called once */ - nca->ndp->gma_flag = 1; - /* Get Mac address from NCSI device */ return nch->handler(nca); } diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index d5611f04926d..a94bb59793f0 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -627,6 +627,9 @@ static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr) saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN); + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); @@ -671,6 +674,9 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) return -ENXIO; + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); -- cgit v1.2.3 From 68e039f966cb577c91649a02591646ac3919f8c9 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 1 Jan 2020 00:00:01 +0100 Subject: batman-adv: Update copyright years for 2020 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 2 +- include/uapi/linux/batman_adv.h | 2 +- net/batman-adv/Kconfig | 2 +- net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.c | 2 +- net/batman-adv/bat_algo.h | 2 +- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_iv_ogm.h | 2 +- net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v.h | 2 +- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_elp.h | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/bat_v_ogm.h | 2 +- net/batman-adv/bitarray.c | 2 +- net/batman-adv/bitarray.h | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/bridge_loop_avoidance.h | 2 +- net/batman-adv/debugfs.c | 2 +- net/batman-adv/debugfs.h | 2 +- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/fragmentation.h | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/gateway_common.h | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hard-interface.h | 2 +- net/batman-adv/hash.c | 2 +- net/batman-adv/hash.h | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/icmp_socket.h | 2 +- net/batman-adv/log.c | 2 +- net/batman-adv/log.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/multicast.h | 2 +- net/batman-adv/netlink.c | 2 +- net/batman-adv/netlink.h | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/network-coding.h | 2 +- net/batman-adv/originator.c | 2 +- net/batman-adv/originator.h | 2 +- net/batman-adv/routing.c | 2 +- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 2 +- net/batman-adv/send.h | 2 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/soft-interface.h | 2 +- net/batman-adv/sysfs.c | 2 +- net/batman-adv/sysfs.h | 2 +- net/batman-adv/tp_meter.c | 2 +- net/batman-adv/tp_meter.h | 2 +- net/batman-adv/trace.c | 2 +- net/batman-adv/trace.h | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/translation-table.h | 2 +- net/batman-adv/tvlv.c | 2 +- net/batman-adv/tvlv.h | 2 +- net/batman-adv/types.h | 2 +- 63 files changed, 63 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 2a15f01c2243..0ae34c85ef9e 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 67f4636758af..617c180ff0c8 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: MIT */ -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index d5028af750d5..045b2b183213 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index fd63e116d9ff..daa49af7ff40 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index fa39eaaab9d7..382fbe51fd34 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 37898da8ad48..686a60bc9492 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus LĂ¼ssing */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5b0b20e6da95..f0209505e41a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index c7a9ba305bfc..0c57c1000c64 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 4ff6cf1ecae7..0ecaf1bb0068 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Linus LĂ¼ssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 37833db098e6..5e0be10bc84e 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus LĂ¼ssing */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 6536e8cc53a0..1e3172db7492 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Linus LĂ¼ssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 1a29505f4f66..4358d436be2a 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Linus LĂ¼ssing, Marek Lindner */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 714ce56cfcc8..969466218999 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index bf16d040461d..0ae2575f70bb 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 7f04a6acf14e..4bc695cda397 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 84ad2d2b6ac9..533c6d44cb58 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 0eff33580f95..41cc87f06b14 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich */ diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 02b24a861a85..41edb2c4a327 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich */ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 38c4d8e51155..452856c27d20 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 1c5afd301ce9..7e2e8f586f42 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 5004e38fe792..906011b60d66 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 67c7729add55..2bff2f4a325c 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors: * * Antonio Quartulli */ diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 385fccdcf69d..7cad97644d05 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll */ diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index abfe8c6556de..881ef328b6cd 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 47df4c678988..e22e49289677 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 0be8e7178ec7..88b5dba84354 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index fc55750542e4..16cd9450ceb1 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 211b14b37db8..c3a0c5a7f7e9 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index afb52282d5bd..c7e98a40dd33 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index bbb8a6f18d6b..bad2e50135e8 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index a9d4e176f4de..68638e0450a6 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 57877f0b78e0..91ae9f32b580 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 0a70b66e8770..ccb535c77e5d 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 27fafff586df..6abd0f4742ef 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 11941cf1adcc..a67b2b091447 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 41fd900121c5..f9884dc56cf3 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4a89177def64..0b840e1e3dfa 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index fd8c0728ddc7..692306df7b6f 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index f9ec8e7507b6..9ebdc1e864b9 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: * * Linus LĂ¼ssing */ diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 5d9e2bb29c97..ebf825991ecd 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors: * * Linus LĂ¼ssing */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 7e052d6f759b..02ed073f95a9 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index ddc674e47dbb..7ee48f916997 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: * * Matthias Schiffer */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 580609389f0f..8f0717c3f7b5 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 753fa49723cf..334289084127 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 38613487fb1b..5b0c2fffc214 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 512a1f99dd75..7bc01c138b3a 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f0f864820dea..3632bd976c56 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index c20feac95107..2ed49db6eff5 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 3ce5f7bad369..7f8ade04e08e 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 5fc0fd1e5d08..0d36e15589f6 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 832e156c519e..5f05a728f347 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 29139ad769fe..534e08d6ad91 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index e5bbc28ed12c..c45962d8527b 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 5e466093dfa5..d987f8b30a98 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Marek Lindner */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index dd6a9a40dbb9..bd2ac570c42c 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 78d310da0ad3..140105215aa2 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 3cedd2c36528..3444d9e4e90d 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index d8f764521c0b..f631b1e01b89 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors: * * Sven Eckelmann */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8a482c5ec67b..852932838ddc 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 4a98860d7f0e..b24d35b9226a 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index aae63f0d21eb..0963a43ad996 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 36985000a0a8..d509d00c7a23 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index bdf9827b5f63..4a17a66cc572 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ -- cgit v1.2.3 From b2e55ca89245fccd459a2d56850822a69a6f91da Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 1 Jan 2020 00:51:33 +0100 Subject: batman-adv: Disable CONFIG_BATMAN_ADV_SYSFS by default The sysfs support in batman-adv is deprecated since a while and will be removed completely next year. All tools which were known to the batman-adv development team are supporting the batman-adv netlink interface since a while. Thus disabling CONFIG_BATMAN_ADV_SYSFS by default should not cause problems on most systems. It is still possible to enable it in case it is still required in a specific setup. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 045b2b183213..c762758a4649 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -100,7 +100,6 @@ config BATMAN_ADV_DEBUG config BATMAN_ADV_SYSFS bool "batman-adv sysfs entries" depends on BATMAN_ADV - default y help Say Y here if you want to enable batman-adv device configuration and status interface through sysfs attributes. It is replaced by the -- cgit v1.2.3 From 44768decb7c0a6c7c01995dbacc864f3921b6dac Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 27 Dec 2019 18:13:18 +0100 Subject: page_pool: handle page recycle for NUMA_NO_NODE condition The check in pool_page_reusable (page_to_nid(page) == pool->p.nid) is not valid if page_pool was configured with pool->p.nid = NUMA_NO_NODE. The goal of the NUMA changes in commit d5394610b1ba ("page_pool: Don't recycle non-reusable pages"), were to have RX-pages that belongs to the same NUMA node as the CPU processing RX-packet during softirq/NAPI. As illustrated by the performance measurements. This patch moves the NAPI checks out of fast-path, and at the same time solves the NUMA_NO_NODE issue. First realize that alloc_pages_node() with pool->p.nid = NUMA_NO_NODE will lookup current CPU nid (Numa ID) via numa_mem_id(), which is used as the the preferred nid. It is only in rare situations, where e.g. NUMA zone runs dry, that page gets doesn't get allocated from preferred nid. The page_pool API allows drivers to control the nid themselves via controlling pool->p.nid. This patch moves the NAPI check to when alloc cache is refilled, via dequeuing/consuming pages from the ptr_ring. Thus, we can allow placing pages from remote NUMA into the ptr_ring, as the dequeue/consume step will check the NUMA node. All current drivers using page_pool will alloc/refill RX-ring from same CPU running softirq/NAPI process. Drivers that control the nid explicitly, also use page_pool_update_nid when changing nid runtime. To speed up transision to new nid the alloc cache is now flushed on nid changes. This force pages to come from ptr_ring, which does the appropate nid check. For the NUMA_NO_NODE case, when a NIC IRQ is moved to another NUMA node, we accept that transitioning the alloc cache doesn't happen immediately. The preferred nid change runtime via consulting numa_mem_id() based on the CPU processing RX-packets. Notice, to avoid stressing the page buddy allocator and avoid doing too much work under softirq with preempt disabled, the NUMA check at ptr_ring dequeue will break the refill cycle, when detecting a NUMA mismatch. This will cause a slower transition, but its done on purpose. Fixes: d5394610b1ba ("page_pool: Don't recycle non-reusable pages") Reported-by: Li RongQing Reported-by: Yunsheng Lin Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/page_pool.c | 80 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a6aefe989043..515bfb97fda1 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -96,10 +96,60 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); +static void __page_pool_return_page(struct page_pool *pool, struct page *page); + +noinline +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, + bool refill) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + int pref_nid; /* preferred NUMA node */ + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Softirq guarantee CPU and thus NUMA node is stable. This, + * assumes CPU refilling driver RX-ring will also run RX-NAPI. + */ + pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; + + /* Slower-path: Get pages from locked ring queue */ + spin_lock(&r->consumer_lock); + + /* Refill alloc array, but only if NUMA match */ + do { + page = __ptr_ring_consume(r); + if (unlikely(!page)) + break; + + if (likely(page_to_nid(page) == pref_nid)) { + pool->alloc.cache[pool->alloc.count++] = page; + } else { + /* NUMA mismatch; + * (1) release 1 page to page-allocator and + * (2) break out to fallthrough to alloc_pages_node. + * This limit stress on page buddy alloactor. + */ + __page_pool_return_page(pool, page); + page = NULL; + break; + } + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL && + refill); + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + + spin_unlock(&r->consumer_lock); + return page; +} + /* fast path */ static struct page *__page_pool_get_cached(struct page_pool *pool) { - struct ptr_ring *r = &pool->ring; bool refill = false; struct page *page; @@ -113,20 +163,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) refill = true; } - /* Quicker fallback, avoid locks when ring is empty */ - if (__ptr_ring_empty(r)) - return NULL; - - /* Slow-path: Get page from locked ring queue, - * refill alloc array if requested. - */ - spin_lock(&r->consumer_lock); - page = __ptr_ring_consume(r); - if (refill) - pool->alloc.count = __ptr_ring_consume_batched(r, - pool->alloc.cache, - PP_ALLOC_CACHE_REFILL); - spin_unlock(&r->consumer_lock); + page = page_pool_refill_alloc_cache(pool, refill); return page; } @@ -311,13 +348,10 @@ static bool __page_pool_recycle_direct(struct page *page, /* page is NOT reusable when: * 1) allocated when system is under some pressure. (page_is_pfmemalloc) - * 2) belongs to a different NUMA node than pool->p.nid. - * - * To update pool->p.nid users must call page_pool_update_nid. */ static bool pool_page_reusable(struct page_pool *pool, struct page *page) { - return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; + return !page_is_pfmemalloc(page); } void __page_pool_put_page(struct page_pool *pool, struct page *page, @@ -484,7 +518,15 @@ EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { + struct page *page; + trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; + + /* Flush pool alloc cache, as refill will check NUMA node */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_update_nid); -- cgit v1.2.3 From f13fc10785bc04f83fe0d7730beb8ff4be563a20 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 27 Dec 2019 18:13:24 +0100 Subject: page_pool: help compiler remove code in case CONFIG_NUMA=n When kernel is compiled without NUMA support, then page_pool NUMA config setting (pool->p.nid) doesn't make any practical sense. The compiler cannot see that it can remove the code paths. This patch avoids reading pool->p.nid setting in case of !CONFIG_NUMA, in allocation and numa check code, which helps compiler to see the optimisation potential. It leaves update code intact to keep API the same. $ ./scripts/bloat-o-meter net/core/page_pool.o-numa-enabled \ net/core/page_pool.o-numa-disabled add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-113 (-113) Function old new delta page_pool_create 401 398 -3 __page_pool_alloc_pages_slow 439 426 -13 page_pool_refill_alloc_cache 425 328 -97 Total: Before=3611, After=3498, chg -3.13% Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/page_pool.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 515bfb97fda1..9b7cbe35df37 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -113,7 +113,12 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, /* Softirq guarantee CPU and thus NUMA node is stable. This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. */ +#ifdef CONFIG_NUMA pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; +#else + /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ + pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ +#endif /* Slower-path: Get pages from locked ring queue */ spin_lock(&r->consumer_lock); @@ -200,7 +205,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, */ /* Cache was empty, do real allocation */ +#ifdef CONFIG_NUMA page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); +#else + page = alloc_pages(gfp, pool->p.order); +#endif if (!page) return NULL; -- cgit v1.2.3 From cea9760950a50fa13b6535fbaa6fa91223394431 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Dec 2019 14:14:25 -0800 Subject: ipv4/tcp: Use local variable for tcp_md5_addr Extract the typecast to (union tcp_md5_addr *) to a local variable rather than the current long, inline declaration with function calls. No functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6f52e5288a6f..c2bfff528578 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -701,9 +701,13 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + const union tcp_md5_addr *addr; + + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk, addr, AF_INET); } else if (hash_location) { + const union tcp_md5_addr *addr; + /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -720,8 +724,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk1) goto out; - key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk1, addr, AF_INET); if (!key) goto out; @@ -905,6 +909,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { + const union tcp_md5_addr *addr; + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ @@ -916,14 +922,14 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * exception of segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, - AF_INET), + tcp_md5_do_lookup(sk, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); } @@ -1149,6 +1155,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + const union tcp_md5_addr *addr; u8 prefixlen = 32; if (optlen < sizeof(cmd)) @@ -1167,16 +1174,16 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } + addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; + if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, - GFP_KERNEL); + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, + cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1301,11 +1308,12 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); + const union tcp_md5_addr *addr; int genhash; unsigned char newhash[16]; - hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, - AF_INET); + addr = (union tcp_md5_addr *)&iph->saddr; + hash_expected = tcp_md5_do_lookup(sk, addr, AF_INET); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -1419,6 +1427,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG + const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; #endif struct ip_options_rcu *inet_opt; @@ -1468,8 +1477,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET); + addr = (union tcp_md5_addr *)&newinet->inet_daddr; + key = tcp_md5_do_lookup(sk, addr, AF_INET); if (key) { /* * We're using one, so create a matching key @@ -1477,8 +1486,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); + tcp_md5_do_add(newsk, addr, AF_INET, 32, + key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif -- cgit v1.2.3 From d14c77e0b24f6d1394d9bd4c4ba629808b656baf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Dec 2019 14:14:26 -0800 Subject: ipv6/tcp: Pass dif and sdif to tcp_v6_inbound_md5_hash The original ingress device index is saved to the cb space of the skb and the cb is moved during tcp processing. Since tcp_v6_inbound_md5_hash can be called before and after the cb move, pass dif and sdif to it so the caller can save both prior to the cb move. Both are used by a later patch. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index df5fd9109696..e94f23b61b62 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -698,7 +698,8 @@ clear_hash_noput: #endif static bool tcp_v6_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + int dif, int sdif) { #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; @@ -953,6 +954,9 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk && sk_fullsock(sk)) { key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr); } else if (hash_location) { + int dif = tcp_v6_iif_l3_slave(skb); + int sdif = tcp_v6_sdif(skb); + /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -964,9 +968,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), - tcp_v6_iif_l3_slave(skb), - tcp_v6_sdif(skb)); + ntohs(th->source), dif, sdif); if (!sk1) goto out; @@ -1480,6 +1482,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { struct sk_buff *skb_to_free; int sdif = inet6_sdif(skb); + int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; @@ -1528,7 +1531,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (tcp_v6_inbound_md5_hash(sk, skb)) { + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1583,7 +1586,7 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v6_inbound_md5_hash(sk, skb)) + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; if (tcp_filter(sk, skb)) -- cgit v1.2.3 From 534322ca3daf56a27c9712ae9b1fef4daede5516 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Dec 2019 14:14:27 -0800 Subject: ipv4/tcp: Pass dif and sdif to tcp_v4_inbound_md5_hash The original ingress device index is saved to the cb space of the skb and the cb is moved during tcp processing. Since tcp_v4_inbound_md5_hash can be called before and after the cb move, pass dif and sdif to it so the caller can save both prior to the cb move. Both are used by a later patch. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c2bfff528578..93f220793add 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -707,6 +707,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) key = tcp_md5_do_lookup(sk, addr, AF_INET); } else if (hash_location) { const union tcp_md5_addr *addr; + int sdif = tcp_v4_sdif(skb); + int dif = inet_iif(skb); /* * active side is lost. Try to find listening socket through @@ -718,8 +720,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb), - tcp_v4_sdif(skb)); + ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ if (!sk1) goto out; @@ -1293,7 +1294,8 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb); /* Called with rcu_read_lock() */ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + int dif, int sdif) { #ifdef CONFIG_TCP_MD5SIG /* @@ -1817,6 +1819,7 @@ int tcp_v4_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); + int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1865,7 +1868,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1923,7 +1926,7 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v4_inbound_md5_hash(sk, skb)) + if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; nf_reset_ct(skb); -- cgit v1.2.3 From dea53bb80e07b9e1641b865493908c20cb8df2ac Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Dec 2019 14:14:28 -0800 Subject: tcp: Add l3index to tcp_md5sig_key and md5 functions Add l3index to tcp_md5sig_key to represent the L3 domain of a key, and add l3index to tcp_md5_do_add and tcp_md5_do_del to fill in the key. With the key now based on an l3index, add the new parameter to the lookup functions and consider the l3index when looking for a match. The l3index comes from the skb when processing ingress packets leveraging the helpers created for socket lookups, tcp_v4_sdif and inet_iif (and the v6 variants). When the sdif index is set it means the packet ingressed a device that is part of an L3 domain and inet_iif points to the VRF device. For egress, the L3 domain is determined from the socket binding and sk_bound_dev_if. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 24 +++++++++--------- net/ipv4/tcp_ipv4.c | 68 +++++++++++++++++++++++++++++++++++--------------- net/ipv6/tcp_ipv6.c | 72 +++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 113 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index e460ea7f767b..7df37e2fddca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1532,8 +1532,9 @@ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ - union tcp_md5_addr addr; u8 prefixlen; + union tcp_md5_addr addr; + int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; @@ -1577,34 +1578,33 @@ struct tcp_md5sig_pool { int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, - gfp_t gfp); + int family, u8 prefixlen, int l3index, + const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen); + int family, u8 prefixlen, int l3index); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG #include extern struct static_key_false tcp_md5_needed; -struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family); static inline struct tcp_md5sig_key * -tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +tcp_md5_do_lookup(const struct sock *sk, int l3index, + const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed)) return NULL; - return __tcp_md5_do_lookup(sk, addr, family); + return __tcp_md5_do_lookup(sk, l3index, addr, family); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else -static inline struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, - const union tcp_md5_addr *addr, - int family) +static inline struct tcp_md5sig_key * +tcp_md5_do_lookup(const struct sock *sk, int l3index, + const union tcp_md5_addr *addr, int family) { return NULL; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 93f220793add..30b3f19d6301 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -702,13 +702,19 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { const union tcp_md5_addr *addr; + int l3index; + /* sdif set, means packet ingressed via a device + * in an L3 domain and inet_iif is set to it. + */ + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; - key = tcp_md5_do_lookup(sk, addr, AF_INET); + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } else if (hash_location) { const union tcp_md5_addr *addr; int sdif = tcp_v4_sdif(skb); int dif = inet_iif(skb); + int l3index; /* * active side is lost. Try to find listening socket through @@ -725,8 +731,12 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk1) goto out; + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to it. + */ + l3index = sdif ? dif : 0; addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; - key = tcp_md5_do_lookup(sk1, addr, AF_INET); + key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); if (!key) goto out; @@ -911,6 +921,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { const union tcp_md5_addr *addr; + int l3index; /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. @@ -924,13 +935,14 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * Rcv.Wind.Shift bits: */ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, addr, AF_INET), + tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); } @@ -990,7 +1002,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); /* Find the Key structure for an address. */ -struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { @@ -1010,7 +1022,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; - + if (key->l3index && key->l3index != l3index) + continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); match = (key->addr.a4.s_addr & mask) == @@ -1034,7 +1047,8 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen) + int family, u8 prefixlen, + int l3index) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1053,6 +1067,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; + if (key->l3index && key->l3index != l3index) + continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) return key; @@ -1064,23 +1080,26 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; + int l3index; + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; - return tcp_md5_do_lookup(sk, addr, AF_INET); + return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, - gfp_t gfp) + int family, u8 prefixlen, int l3index, + const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); @@ -1112,6 +1131,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key->keylen = newkeylen; key->family = family; key->prefixlen = prefixlen; + key->l3index = l3index; memcpy(&key->addr, addr, (family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -1121,11 +1141,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, - u8 prefixlen) + u8 prefixlen, int l3index) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1158,6 +1178,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; const union tcp_md5_addr *addr; u8 prefixlen = 32; + int l3index = 0; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1178,12 +1199,12 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, addr, AF_INET, prefixlen); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1311,11 +1332,16 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); const union tcp_md5_addr *addr; - int genhash; unsigned char newhash[16]; + int genhash, l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; addr = (union tcp_md5_addr *)&iph->saddr; - hash_expected = tcp_md5_do_lookup(sk, addr, AF_INET); + hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -1341,11 +1367,11 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", + net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), genhash ? " tcp_v4_calc_md5_hash failed" - : ""); + : "", l3index); return true; } return false; @@ -1431,6 +1457,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; + int l3index; #endif struct ip_options_rcu *inet_opt; @@ -1478,9 +1505,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); /* Copy over the MD5 key from the original socket */ addr = (union tcp_md5_addr *)&newinet->inet_daddr; - key = tcp_md5_do_lookup(sk, addr, AF_INET); + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key) { /* * We're using one, so create a matching key @@ -1488,7 +1516,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, addr, AF_INET, 32, + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e94f23b61b62..71ad7d89be0f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -81,7 +81,8 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #else static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr) + const struct in6_addr *addr, + int l3index) { return NULL; } @@ -532,15 +533,22 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr) + const struct in6_addr *addr, + int l3index) { - return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); + return tcp_md5_do_lookup(sk, l3index, + (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { - return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); + int l3index; + + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); + return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, + l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, @@ -548,6 +556,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; + int l3index = 0; u8 prefixlen; if (optlen < sizeof(cmd)) @@ -572,9 +581,9 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen); + AF_INET, prefixlen, l3index); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen); + AF_INET6, prefixlen, l3index); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) @@ -582,12 +591,13 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen, cmd.tcpm_key, - cmd.tcpm_keylen, GFP_KERNEL); + AF_INET, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, + GFP_KERNEL); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, cmd.tcpm_key, - cmd.tcpm_keylen, GFP_KERNEL); + AF_INET6, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -706,10 +716,15 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - int genhash; + int genhash, l3index; u8 newhash[16]; - hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr, l3index); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -733,10 +748,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", + net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", genhash ? "failed" : "mismatch", &ip6h->saddr, ntohs(th->source), - &ip6h->daddr, ntohs(th->dest)); + &ip6h->daddr, ntohs(th->dest), l3index); return true; } #endif @@ -952,10 +967,17 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { - key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr); + int l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and inet_iif is set to it. + */ + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); } else if (hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); + int l3index; /* * active side is lost. Try to find listening socket through @@ -972,7 +994,12 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk1) goto out; - key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to it. + */ + l3index = tcp_v6_sdif(skb) ? dif : 0; + + key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key) goto out; @@ -1042,6 +1069,10 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { + int l3index; + + l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ @@ -1056,7 +1087,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), 0, 0, sk->sk_priority); } @@ -1128,6 +1159,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; + int l3index; #endif struct flowi6 fl6; @@ -1271,8 +1303,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); + /* Copy over the MD5 key from the original socket */ - key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr); + key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { /* We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1280,7 +1314,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, key->key, key->keylen, + AF_INET6, 128, l3index, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } #endif -- cgit v1.2.3 From 6b102db50cdde3ba2f78631ed21222edf3a5fb51 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Dec 2019 14:14:29 -0800 Subject: net: Add device index to tcp_md5sig Add support for userspace to specify a device index to limit the scope of an entry via the TCP_MD5SIG_EXT setsockopt. The existing __tcpm_pad is renamed to tcpm_ifindex and the new field is only checked if the new TCP_MD5SIG_FLAG_IFINDEX is set in tcpm_flags. For now, the device index must point to an L3 master device (e.g., VRF). The API and error handling are setup to allow the constraint to be relaxed in the future to any device index. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 5 +++-- net/ipv4/tcp_ipv4.c | 18 ++++++++++++++++++ net/ipv6/tcp_ipv6.c | 20 +++++++++++++++++++- 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 74af1f759cee..d87184e673ca 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -317,14 +317,15 @@ enum { #define TCP_MD5SIG_MAXKEYLEN 80 /* tcp_md5sig extension flags for TCP_MD5SIG_EXT */ -#define TCP_MD5SIG_FLAG_PREFIX 1 /* address prefix length */ +#define TCP_MD5SIG_FLAG_PREFIX 0x1 /* address prefix length */ +#define TCP_MD5SIG_FLAG_IFINDEX 0x2 /* ifindex set */ struct tcp_md5sig { struct __kernel_sockaddr_storage tcpm_addr; /* address associated */ __u8 tcpm_flags; /* extension flags */ __u8 tcpm_prefixlen; /* address prefix */ __u16 tcpm_keylen; /* key length */ - __u32 __tcpm_pad; /* zero */ + int tcpm_ifindex; /* device index for scope */ __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 30b3f19d6301..4adac9c75343 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1196,6 +1196,24 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); + if (dev && netif_is_l3_master(dev)) + l3index = dev->ifindex; + + rcu_read_unlock(); + + /* ok to reference set/not set outside of rcu; + * right now device MUST be an L3 master + */ + if (!dev || !l3index) + return -EINVAL; + } + addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; if (!cmd.tcpm_keylen) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 71ad7d89be0f..95e4e1e95db2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -578,10 +578,28 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); + if (dev && netif_is_l3_master(dev)) + l3index = dev->ifindex; + rcu_read_unlock(); + + /* ok to reference set/not set outside of rcu; + * right now device MUST be an L3 master + */ + if (!dev || !l3index) + return -EINVAL; + } + if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen, l3index); + AF_INET, prefixlen, + l3index); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index); } -- cgit v1.2.3 From d0e8bcafc8aff5553beffe55046795f9bab9fe7b Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Thu, 2 Jan 2020 22:02:27 +0800 Subject: tcp: use REXMIT_NEW instead of magic number REXMIT_NEW is a macro for "FRTO-style transmit of unsent/new packets", this patch makes it more readable. Signed-off-by: Mao Wenan Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 88b987ca9ebb..1d1e3493965f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3550,7 +3550,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; - if (unlikely(rexmit == 2)) { + if (unlikely(rexmit == REXMIT_NEW)) { __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); if (after(tp->snd_nxt, tp->high_seq)) -- cgit v1.2.3 From 4a883ccfba3adefc7123ec1a6da3c7301a56ce97 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 3 Jan 2020 03:48:56 +0000 Subject: ethtool: remove set but not used variable 'lsettings' Fixes gcc '-Wunused-but-set-variable' warning: net/ethtool/linkmodes.c: In function 'ethnl_set_linkmodes': net/ethtool/linkmodes.c:326:32: warning: variable 'lsettings' set but not used [-Wunused-but-set-variable] struct ethtool_link_settings *lsettings; ^ It is never used, so remove it. Reported-by: Hulk Robot Signed-off-by: YueHaibing Reviewed-by: Michal Kubecek Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ethtool/linkmodes.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 0b99f494ad3b..96f20be64553 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -323,7 +323,6 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_LINKMODES_MAX + 1]; struct ethtool_link_ksettings ksettings = {}; - struct ethtool_link_settings *lsettings; struct ethnl_req_info req_info = {}; struct net_device *dev; bool mod = false; @@ -354,7 +353,6 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out_ops; } - lsettings = &ksettings.base; ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod); if (ret < 0) -- cgit v1.2.3 From b39c78b2aa09cae05f3a48c11f67b3add0d604de Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 3 Jan 2020 11:51:00 +0800 Subject: net: remove the check argument from __skb_gro_checksum_convert The argument is always ignored, so remove it. Signed-off-by: Li RongQing Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++--- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2fd19fb8826d..2741aa35bec6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2826,16 +2826,16 @@ static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) } static inline void __skb_gro_checksum_convert(struct sk_buff *skb, - __sum16 check, __wsum pseudo) + __wsum pseudo) { NAPI_GRO_CB(skb)->csum = ~pseudo; NAPI_GRO_CB(skb)->csum_valid = 1; } -#define skb_gro_checksum_try_convert(skb, proto, check, compute_pseudo) \ +#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_gro_checksum_convert_check(skb)) \ - __skb_gro_checksum_convert(skb, check, \ + __skb_gro_checksum_convert(skb, \ compute_pseudo(skb, proto)); \ } while (0) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 4de7e962d3da..2e6d1b7a7bc9 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head, if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; - skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, + skb_gro_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a3908e55ed89..b25e42100ceb 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -480,7 +480,7 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) inet_gro_compute_pseudo)) goto flush; else if (uh->check) - skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 64b8f05d6735..f0d5fc27d0b5 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -127,7 +127,7 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo)) goto flush; else if (uh->check) - skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, ip6_gro_compute_pseudo); skip: -- cgit v1.2.3 From 1f623431100138ffaaae36e122f2727c13de640a Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 3 Jan 2020 09:20:40 +0000 Subject: net: Remove redundant BUG_ON() check in phonet_pernet Passing NULL to phonet_pernet causes a crash via BUG_ON. Dereferencing net in net_generic() also has the same effect. This patch removes the redundant BUG_ON check on the same parameter. Signed-off-by: Xu Wang Signed-off-by: David S. Miller --- net/phonet/pn_dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 49bd76a87461..ac0fae06cc15 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -35,8 +35,6 @@ static unsigned int phonet_net_id __read_mostly; static struct phonet_net *phonet_pernet(struct net *net) { - BUG_ON(!net); - return net_generic(net, phonet_net_id); } -- cgit v1.2.3 From d2e9d229cfbd73baf33a817a0212233cea7df59a Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 3 Jan 2020 09:28:16 +0000 Subject: l2tp: Remove redundant BUG_ON() check in l2tp_pernet Passing NULL to l2tp_pernet causes a crash via BUG_ON. Dereferencing net in net_generic() also has the same effect. This patch removes the redundant BUG_ON check on the same parameter. Signed-off-by: Xu Wang Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index f82ea12bac37..c99223cb3338 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -122,8 +122,6 @@ static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) static inline struct l2tp_net *l2tp_pernet(const struct net *net) { - BUG_ON(!net); - return net_generic(net, l2tp_net_id); } -- cgit v1.2.3 From 36278a5d4d354e5d5610aa728831db9e03cc3d8d Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Wed, 11 Dec 2019 01:54:43 +0000 Subject: Bluetooth: Adding a bt_dev_warn_ratelimited macro. The macro will be used to display rate limited warning messages in the log. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 4 ++++ net/bluetooth/lib.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index fabee6db0abb..bd2675266859 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -129,6 +129,8 @@ void bt_warn(const char *fmt, ...); __printf(1, 2) void bt_err(const char *fmt, ...); __printf(1, 2) +void bt_warn_ratelimited(const char *fmt, ...); +__printf(1, 2) void bt_err_ratelimited(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) @@ -147,6 +149,8 @@ void bt_err_ratelimited(const char *fmt, ...); #define bt_dev_dbg(hdev, fmt, ...) \ BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) +#define bt_dev_warn_ratelimited(hdev, fmt, ...) \ + bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ BT_ERR_RATELIMITED("%s: " fmt, (hdev)->name, ##__VA_ARGS__) diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 63e65d9b4b24..c09e0a3a0ed9 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -183,6 +183,22 @@ void bt_err(const char *format, ...) } EXPORT_SYMBOL(bt_err); +void bt_warn_ratelimited(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + pr_warn_ratelimited("%pV", &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_warn_ratelimited); + void bt_err_ratelimited(const char *format, ...) { struct va_format vaf; -- cgit v1.2.3 From 657cc646475b721f5c5bab82e7fd43302c7c8358 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 11 Dec 2019 11:34:36 +0100 Subject: Bluetooth: Remove usage of BT_ERR_RATELIMITED macro The macro is really not needed and can be replaced with either usage of bt_err_ratelimited or bt_dev_err_ratelimited. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 4 +--- net/bluetooth/hci_event.c | 14 ++++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bd2675266859..e42bb8e03c09 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -138,8 +138,6 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) -#define BT_ERR_RATELIMITED(fmt, ...) bt_err_ratelimited(fmt "\n", ##__VA_ARGS__) - #define bt_dev_info(hdev, fmt, ...) \ BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ @@ -152,7 +150,7 @@ void bt_err_ratelimited(const char *fmt, ...); #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ - BT_ERR_RATELIMITED("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) /* Connection and socket states */ enum { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c1d3a303d97f..1941f120a376 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5451,7 +5451,7 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } -static u8 ext_evt_type_to_legacy(u16 evt_type) +static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type) { if (evt_type & LE_EXT_ADV_LEGACY_PDU) { switch (evt_type) { @@ -5468,10 +5468,7 @@ static u8 ext_evt_type_to_legacy(u16 evt_type) return LE_ADV_SCAN_RSP; } - BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", - evt_type); - - return LE_ADV_INVALID; + goto invalid; } if (evt_type & LE_EXT_ADV_CONN_IND) { @@ -5491,8 +5488,9 @@ static u8 ext_evt_type_to_legacy(u16 evt_type) evt_type & LE_EXT_ADV_DIRECT_IND) return LE_ADV_NONCONN_IND; - BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", - evt_type); +invalid: + bt_dev_err_ratelimited(hdev, "Unknown advertising packet type: 0x%02x", + evt_type); return LE_ADV_INVALID; } @@ -5510,7 +5508,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) u16 evt_type; evt_type = __le16_to_cpu(ev->evt_type); - legacy_evt_type = ext_evt_type_to_legacy(evt_type); + legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &ev->bdaddr, ev->bdaddr_type, NULL, 0, ev->rssi, -- cgit v1.2.3 From 1efd927d660e6ab02a9cd32fbbe3c7dc47980132 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 2 Jan 2020 15:00:55 -0800 Subject: Bluetooth: Add support for LE PHY Update Complete event This handles LE PHY Update Complete event and store both tx_phy and rx_phy into hci_conn. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_event.c | 27 +++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5bc1e30dedde..07b6ecedc6ce 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2186,6 +2186,14 @@ struct hci_ev_le_direct_adv_info { __s8 rssi; } __packed; +#define HCI_EV_LE_PHY_UPDATE_COMPLETE 0x0c +struct hci_ev_le_phy_update_complete { + __u8 status; + __u16 handle; + __u8 tx_phy; + __u8 rx_phy; +} __packed; + #define HCI_EV_LE_EXT_ADV_REPORT 0x0d struct hci_ev_le_ext_adv_report { __le16 evt_type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b689aceb636b..faebe3859931 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -493,6 +493,8 @@ struct hci_conn { __u16 le_supv_timeout; __u8 le_adv_data[HCI_MAX_AD_LENGTH]; __u8 le_adv_data_len; + __u8 le_tx_phy; + __u8 le_rx_phy; __s8 rssi; __s8 tx_power; __s8 max_tx_power; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1941f120a376..6ddc4a74a5e4 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5718,6 +5718,29 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, hci_dev_unlock(hdev); } +static void hci_le_phy_update_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_le_phy_update_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + if (!ev->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + conn->le_tx_phy = ev->tx_phy; + conn->le_rx_phy = ev->rx_phy; + +unlock: + hci_dev_unlock(hdev); +} + static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_le_meta *le_ev = (void *) skb->data; @@ -5753,6 +5776,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_direct_adv_report_evt(hdev, skb); break; + case HCI_EV_LE_PHY_UPDATE_COMPLETE: + hci_le_phy_update_evt(hdev, skb); + break; + case HCI_EV_LE_EXT_ADV_REPORT: hci_le_ext_adv_report_evt(hdev, skb); break; -- cgit v1.2.3 From 4b6e228e297b73451f3a4b12fb7d0b24d9d32e6f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 2 Jan 2020 15:00:57 -0800 Subject: Bluetooth: Auto tune if input MTU is set to 0 This enables the code to set the input MTU using the underline link packet types when set to 0, previously this would likely be rejected by the remote peer since it would be bellow the minimal of 48 for BR/EDR or 23 for LE, that way it shall be safe to use 0 without causing any side effects. This is convenient for the likes of A2DP transport, see: https://habr.com/en/post/456182/ Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 54 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a845786258a0..1bca608e0170 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1289,6 +1289,9 @@ static void l2cap_le_connect(struct l2cap_chan *chan) if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) return; + if (!chan->imtu) + chan->imtu = chan->conn->mtu; + l2cap_le_flowctl_init(chan, 0); req.psm = chan->psm; @@ -3226,6 +3229,49 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan) chan->ack_win = chan->tx_win; } +static void l2cap_mtu_auto(struct l2cap_chan *chan) +{ + struct hci_conn *conn = chan->conn->hcon; + + chan->imtu = L2CAP_DEFAULT_MIN_MTU; + + /* The 2-DH1 packet has between 2 and 56 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_2DH1)) + chan->imtu = 54; + + /* The 3-DH1 packet has between 2 and 85 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_3DH1)) + chan->imtu = 83; + + /* The 2-DH3 packet has between 2 and 369 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_2DH3)) + chan->imtu = 367; + + /* The 3-DH3 packet has between 2 and 554 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_3DH3)) + chan->imtu = 552; + + /* The 2-DH5 packet has between 2 and 681 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_2DH5)) + chan->imtu = 679; + + /* The 3-DH5 packet has between 2 and 1023 information bytes + * (including the 2-byte payload header) + */ + if (!(conn->pkt_type & HCI_3DH5)) + chan->imtu = 1021; +} + static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_req *req = data; @@ -3255,8 +3301,12 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data } done: - if (chan->imtu != L2CAP_DEFAULT_MTU) - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); + if (chan->imtu != L2CAP_DEFAULT_MTU) { + if (!chan->imtu) + l2cap_mtu_auto(chan); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, + endptr - ptr); + } switch (chan->mode) { case L2CAP_MODE_BASIC: -- cgit v1.2.3 From a68578c20a9667463ee3000402b21644ea62d753 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 4 Jan 2020 02:37:10 +0200 Subject: net: dsa: Make deferred_xmit private to sja1105 There are 3 things that are wrong with the DSA deferred xmit mechanism: 1. Its introduction has made the DSA hotpath ever so slightly more inefficient for everybody, since DSA_SKB_CB(skb)->deferred_xmit needs to be initialized to false for every transmitted frame, in order to figure out whether the driver requested deferral or not (a very rare occasion, rare even for the only driver that does use this mechanism: sja1105). That was necessary to avoid kfree_skb from freeing the skb. 2. Because L2 PTP is a link-local protocol like STP, it requires management routes and deferred xmit with this switch. But as opposed to STP, the deferred work mechanism needs to schedule the packet rather quickly for the TX timstamp to be collected in time and sent to user space. But there is no provision for controlling the scheduling priority of this deferred xmit workqueue. Too bad this is a rather specific requirement for a feature that nobody else uses (more below). 3. Perhaps most importantly, it makes the DSA core adhere a bit too much to the NXP company-wide policy "Innovate Where It Doesn't Matter". The sja1105 is probably the only DSA switch that requires some frames sent from the CPU to be routed to the slave port via an out-of-band configuration (register write) rather than in-band (DSA tag). And there are indeed very good reasons to not want to do that: if that out-of-band register is at the other end of a slow bus such as SPI, then you limit that Ethernet flow's throughput to effectively the throughput of the SPI bus. So hardware vendors should definitely not be encouraged to design this way. We do _not_ want more widespread use of this mechanism. Luckily we have a solution for each of the 3 issues: For 1, we can just remove that variable in the skb->cb and counteract the effect of kfree_skb with skb_get, much to the same effect. The advantage, of course, being that anybody who doesn't use deferred xmit doesn't need to do any extra operation in the hotpath. For 2, we can create a kernel thread for each port's deferred xmit work. If the user switch ports are named swp0, swp1, swp2, the kernel threads will be named swp0_xmit, swp1_xmit, swp2_xmit (there appears to be a 15 character length limit on kernel thread names). With this, the user can change the scheduling priority with chrt $(pidof swp2_xmit). For 3, we can actually move the entire implementation to the sja1105 driver. So this patch deletes the generic implementation from the DSA core and adds a new one, more adequate to the requirements of PTP TX timestamping, in sja1105_main.c. Suggested-by: Florian Fainelli Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 96 ++++++++++++++++++++++++++-------- include/linux/dsa/sja1105.h | 3 ++ include/net/dsa.h | 9 ---- net/dsa/dsa_priv.h | 2 - net/dsa/slave.c | 37 +------------ net/dsa/tag_sja1105.c | 15 +++++- 6 files changed, 93 insertions(+), 69 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 79dd965227bc..61795833c8f5 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1732,6 +1732,16 @@ static int sja1105_setup(struct dsa_switch *ds) static void sja1105_teardown(struct dsa_switch *ds) { struct sja1105_private *priv = ds->priv; + int port; + + for (port = 0; port < SJA1105_NUM_PORTS; port++) { + struct sja1105_port *sp = &priv->ports[port]; + + if (!dsa_is_user_port(ds, port)) + continue; + + kthread_destroy_worker(sp->xmit_worker); + } sja1105_tas_teardown(ds); sja1105_ptp_clock_unregister(ds); @@ -1753,6 +1763,18 @@ static int sja1105_port_enable(struct dsa_switch *ds, int port, return 0; } +static void sja1105_port_disable(struct dsa_switch *ds, int port) +{ + struct sja1105_private *priv = ds->priv; + struct sja1105_port *sp = &priv->ports[port]; + + if (!dsa_is_user_port(ds, port)) + return; + + kthread_cancel_work_sync(&sp->xmit_work); + skb_queue_purge(&sp->xmit_queue); +} + static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot, struct sk_buff *skb, bool takets) { @@ -1811,31 +1833,36 @@ static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot, return NETDEV_TX_OK; } +#define work_to_port(work) \ + container_of((work), struct sja1105_port, xmit_work) +#define tagger_to_sja1105(t) \ + container_of((t), struct sja1105_private, tagger_data) + /* Deferred work is unfortunately necessary because setting up the management * route cannot be done from atomit context (SPI transfer takes a sleepable * lock on the bus) */ -static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port, - struct sk_buff *skb) +static void sja1105_port_deferred_xmit(struct kthread_work *work) { - struct sja1105_private *priv = ds->priv; - struct sk_buff *clone; - - mutex_lock(&priv->mgmt_lock); + struct sja1105_port *sp = work_to_port(work); + struct sja1105_tagger_data *tagger_data = sp->data; + struct sja1105_private *priv = tagger_to_sja1105(tagger_data); + int port = sp - priv->ports; + struct sk_buff *skb; - /* The clone, if there, was made by dsa_skb_tx_timestamp */ - clone = DSA_SKB_CB(skb)->clone; + while ((skb = skb_dequeue(&sp->xmit_queue)) != NULL) { + struct sk_buff *clone = DSA_SKB_CB(skb)->clone; - sja1105_mgmt_xmit(ds, port, 0, skb, !!clone); + mutex_lock(&priv->mgmt_lock); - if (!clone) - goto out; + sja1105_mgmt_xmit(priv->ds, port, 0, skb, !!clone); - sja1105_ptp_txtstamp_skb(ds, port, clone); + /* The clone, if there, was made by dsa_skb_tx_timestamp */ + if (clone) + sja1105_ptp_txtstamp_skb(priv->ds, port, clone); -out: - mutex_unlock(&priv->mgmt_lock); - return NETDEV_TX_OK; + mutex_unlock(&priv->mgmt_lock); + } } /* The MAXAGE setting belongs to the L2 Forwarding Parameters table, @@ -1966,6 +1993,7 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .get_sset_count = sja1105_get_sset_count, .get_ts_info = sja1105_get_ts_info, .port_enable = sja1105_port_enable, + .port_disable = sja1105_port_disable, .port_fdb_dump = sja1105_fdb_dump, .port_fdb_add = sja1105_fdb_add, .port_fdb_del = sja1105_fdb_del, @@ -1979,7 +2007,6 @@ static const struct dsa_switch_ops sja1105_switch_ops = { .port_mdb_prepare = sja1105_mdb_prepare, .port_mdb_add = sja1105_mdb_add, .port_mdb_del = sja1105_mdb_del, - .port_deferred_xmit = sja1105_port_deferred_xmit, .port_hwtstamp_get = sja1105_hwtstamp_get, .port_hwtstamp_set = sja1105_hwtstamp_set, .port_rxtstamp = sja1105_port_rxtstamp, @@ -2031,7 +2058,7 @@ static int sja1105_probe(struct spi_device *spi) struct device *dev = &spi->dev; struct sja1105_private *priv; struct dsa_switch *ds; - int rc, i; + int rc, port; if (!dev->of_node) { dev_err(dev, "No DTS bindings for SJA1105 driver\n"); @@ -2096,15 +2123,42 @@ static int sja1105_probe(struct spi_device *spi) return rc; /* Connections between dsa_port and sja1105_port */ - for (i = 0; i < SJA1105_NUM_PORTS; i++) { - struct sja1105_port *sp = &priv->ports[i]; + for (port = 0; port < SJA1105_NUM_PORTS; port++) { + struct sja1105_port *sp = &priv->ports[port]; + struct dsa_port *dp = dsa_to_port(ds, port); + struct net_device *slave; + + if (!dsa_is_user_port(ds, port)) + continue; - dsa_to_port(ds, i)->priv = sp; - sp->dp = dsa_to_port(ds, i); + dp->priv = sp; + sp->dp = dp; sp->data = tagger_data; + slave = dp->slave; + kthread_init_work(&sp->xmit_work, sja1105_port_deferred_xmit); + sp->xmit_worker = kthread_create_worker(0, "%s_xmit", + slave->name); + if (IS_ERR(sp->xmit_worker)) { + rc = PTR_ERR(sp->xmit_worker); + dev_err(ds->dev, + "failed to create deferred xmit thread: %d\n", + rc); + goto out; + } + skb_queue_head_init(&sp->xmit_queue); } return 0; +out: + while (port-- > 0) { + struct sja1105_port *sp = &priv->ports[port]; + + if (!dsa_is_user_port(ds, port)) + continue; + + kthread_destroy_worker(sp->xmit_worker); + } + return rc; } static int sja1105_remove(struct spi_device *spi) diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 317e05b2584b..fa5735c353cd 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -53,6 +53,9 @@ struct sja1105_skb_cb { ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) struct sja1105_port { + struct kthread_worker *xmit_worker; + struct kthread_work xmit_work; + struct sk_buff_head xmit_queue; struct sja1105_tagger_data *data; struct dsa_port *dp; bool hwts_tx_en; diff --git a/include/net/dsa.h b/include/net/dsa.h index da5578db228e..23b1c58656d4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -90,7 +90,6 @@ struct dsa_device_ops { struct dsa_skb_cb { struct sk_buff *clone; - bool deferred_xmit; }; struct __dsa_skb_cb { @@ -192,9 +191,6 @@ struct dsa_port { struct phylink *pl; struct phylink_config pl_config; - struct work_struct xmit_work; - struct sk_buff_head xmit_queue; - struct list_head list; /* @@ -564,11 +560,6 @@ struct dsa_switch_ops { bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); - /* - * Deferred frame Tx - */ - netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port, - struct sk_buff *skb); /* Devlink parameters */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 09ea2fd78c74..8a162605b861 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -162,8 +162,6 @@ int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev); - static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 78ffc87dc25e..c1828bdc79dc 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -116,9 +116,6 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - phylink_stop(dp->pl); dsa_port_disable(dp); @@ -518,7 +515,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) s->tx_bytes += skb->len; u64_stats_update_end(&s->syncp); - DSA_SKB_CB(skb)->deferred_xmit = false; DSA_SKB_CB(skb)->clone = NULL; /* Identify PTP protocol packets, clone them, and pass them to the @@ -531,39 +527,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) */ nskb = p->xmit(skb, dev); if (!nskb) { - if (!DSA_SKB_CB(skb)->deferred_xmit) - kfree_skb(skb); + kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } -void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - DSA_SKB_CB(skb)->deferred_xmit = true; - - skb_queue_tail(&dp->xmit_queue, skb); - schedule_work(&dp->xmit_work); - return NULL; -} -EXPORT_SYMBOL_GPL(dsa_defer_xmit); - -static void dsa_port_xmit_work(struct work_struct *work) -{ - struct dsa_port *dp = container_of(work, struct dsa_port, xmit_work); - struct dsa_switch *ds = dp->ds; - struct sk_buff *skb; - - if (unlikely(!ds->ops->port_deferred_xmit)) - return; - - while ((skb = skb_dequeue(&dp->xmit_queue)) != NULL) - ds->ops->port_deferred_xmit(ds, dp->index, skb); -} - /* ethtool operations *******************************************************/ static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -1367,9 +1337,6 @@ int dsa_slave_suspend(struct net_device *slave_dev) if (!netif_running(slave_dev)) return 0; - cancel_work_sync(&dp->xmit_work); - skb_queue_purge(&dp->xmit_queue); - netif_device_detach(slave_dev); rtnl_lock(); @@ -1455,8 +1422,6 @@ int dsa_slave_create(struct dsa_port *port) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - INIT_WORK(&port->xmit_work, dsa_port_xmit_work); - skb_queue_head_init(&port->xmit_queue); p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 63ef2a14c934..7c2b84393cc6 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -83,6 +83,19 @@ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) return false; } +/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ +static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, + struct sk_buff *skb) +{ + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + skb_queue_tail(&sp->xmit_queue, skb_get(skb)); + kthread_queue_work(sp->xmit_worker, &sp->xmit_work); + + return NULL; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -97,7 +110,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, * is the .port_deferred_xmit driver callback. */ if (unlikely(sja1105_is_link_local(skb))) - return dsa_defer_xmit(skb, netdev); + return sja1105_defer_xmit(dp->priv, skb); /* If we are under a vlan_filtering bridge, IP termination on * switch ports based on 802.1Q tags is simply too brittle to -- cgit v1.2.3 From 2821d50fc0c45e45bc10781d4dd332e21e7fb980 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 4 Jan 2020 02:37:11 +0200 Subject: net: dsa: tag_sja1105: Slightly improve the Xmas tree in sja1105_xmit This is a cosmetic patch that makes the dp, tx_vid, queue_mapping and pcp local variable definitions a bit closer in length, so they don't look like an eyesore as much. The 'ds' variable is not used otherwise, except for ds->dp. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 7c2b84393cc6..5366ea430349 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -100,8 +100,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - struct dsa_switch *ds = dp->ds; - u16 tx_vid = dsa_8021q_tx_vid(ds, dp->index); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); -- cgit v1.2.3 From 787cac3f5a650fd3184a41c5a27a2fe9ded833aa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 6 Jan 2020 03:34:12 +0200 Subject: net: dsa: Pass pcs_poll flag from driver to PHYLINK The DSA drivers that implement .phylink_mac_link_state should normally register an interrupt for the PCS, from which they should call phylink_mac_change(). However not all switches implement this, and those who don't should set this flag in dsa_switch in the .setup callback, so that PHYLINK will poll for a few ms until the in-band AN link timer expires and the PCS state settles. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ net/dsa/port.c | 1 + 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 23b1c58656d4..0c39fed8cd99 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -279,6 +279,11 @@ struct dsa_switch { */ bool vlan_filtering; + /* MAC PCS does not provide link state change interrupt, and requires + * polling. Flag passed on to PHYLINK. + */ + bool pcs_poll; + size_t num_ports; }; diff --git a/net/dsa/port.c b/net/dsa/port.c index ffb5601f7ed6..774facb8d547 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -599,6 +599,7 @@ static int dsa_port_phylink_register(struct dsa_port *dp) dp->pl_config.dev = ds->dev; dp->pl_config.type = PHYLINK_DEV; + dp->pl_config.pcs_poll = ds->pcs_poll; dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); -- cgit v1.2.3 From 53ebeca24a8749f539f1957c5de07fdb031b09ed Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Jan 2020 15:24:15 +0000 Subject: net/rose: remove redundant assignment to variable failed The variable failed is being assigned a value that is never read, the following goto statement jumps to the end of the function and variable failed is not referenced at all. Remove the redundant assignment. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Dan Carpenter Signed-off-by: David S. Miller --- net/rose/rose_route.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index c53307623236..5277631fa14c 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -696,7 +696,6 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; - failed = 0; goto out; } failed = 1; -- cgit v1.2.3 From 788d10c02f7ed1c971445da5e2addfc70f620311 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Jan 2020 18:00:13 +0000 Subject: Bluetooth: remove redundant assignment to variable icid Variable icid is being rc is assigned with a value that is never read. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Simon Horman Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 1bca608e0170..195459a1e53e 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5081,7 +5081,6 @@ static inline int l2cap_move_channel_req(struct l2cap_conn *conn, chan->move_role = L2CAP_MOVE_ROLE_RESPONDER; l2cap_move_setup(chan); chan->move_id = req->dest_amp_id; - icid = chan->dcid; if (req->dest_amp_id == AMP_ID_BREDR) { /* Moving to BR/EDR */ -- cgit v1.2.3 From 7c7b58d46b7636ff3cfb01cc633dda852fda70c8 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Tue, 7 Jan 2020 11:30:33 -0800 Subject: net/ncsi: Send device address as source address After receiving device mac address from device, send this as a source address for further commands instead of broadcast address. This will help in multi host NIC cards. Signed-off-by: Vijay Khemka Signed-off-by: David S. Miller --- net/ncsi/ncsi-cmd.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 0187e65176c0..ba9ae482141b 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -369,7 +369,15 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) eh = skb_push(nr->cmd, sizeof(*eh)); eh->h_proto = htons(ETH_P_NCSI); eth_broadcast_addr(eh->h_dest); - eth_broadcast_addr(eh->h_source); + + /* If mac address received from device then use it for + * source address as unicast address else use broadcast + * address as source address + */ + if (nca->ndp->gma_flag == 1) + memcpy(eh->h_source, nca->ndp->ndev.dev->dev_addr, ETH_ALEN); + else + eth_broadcast_addr(eh->h_source); /* Start the timer for the request that might not have * corresponding response. Given NCSI is an internal -- cgit v1.2.3 From b9ae51273655a72a12fba730843fd72fb132735a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 7 Jan 2020 21:03:39 +0100 Subject: hsr: fix dummy hsr_debugfs_rename() declaration The hsr_debugfs_rename prototype got an extra 'void' that needs to be removed again: In file included from /git/arm-soc/net/hsr/hsr_main.c:12: net/hsr/hsr_main.h:194:20: error: two or more data types in declaration specifiers static inline void void hsr_debugfs_rename(struct net_device *dev) Fixes: 4c2d5e33dcd3 ("hsr: rename debugfs file when interface name is changed") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/hsr/hsr_main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index d40de84a637f..754d84b217f0 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -191,7 +191,7 @@ void hsr_debugfs_term(struct hsr_priv *priv); void hsr_debugfs_create_root(void); void hsr_debugfs_remove_root(void); #else -static inline void void hsr_debugfs_rename(struct net_device *dev) +static inline void hsr_debugfs_rename(struct net_device *dev) { } static inline void hsr_debugfs_init(struct hsr_priv *priv, -- cgit v1.2.3 From 542d3065f2b1a60a0cfc259c9a36faa470761d78 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Jan 2020 22:44:43 +0100 Subject: socket: fix unused-function warning When procfs is disabled, the fdinfo code causes a harmless warning: net/socket.c:1000:13: error: 'sock_show_fdinfo' defined but not used [-Werror=unused-function] static void sock_show_fdinfo(struct seq_file *m, struct file *f) Move the function definition up so we can use a single #ifdef around it. Fixes: b4653342b151 ("net: Allow to show socket-specific information in /proc/[pid]/fdinfo/[fd]") Suggested-by: Al Viro Acked-by: Kirill Tkhai Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/socket.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 51bf34995bcb..b79a05de7c6e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -128,7 +128,18 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -static void sock_show_fdinfo(struct seq_file *m, struct file *f); + +#ifdef CONFIG_PROC_FS +static void sock_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct socket *sock = f->private_data; + + if (sock->ops->show_fdinfo) + sock->ops->show_fdinfo(m, sock); +} +#else +#define sock_show_fdinfo NULL +#endif /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear @@ -151,9 +162,7 @@ static const struct file_operations socket_file_ops = { .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, -#ifdef CONFIG_PROC_FS .show_fdinfo = sock_show_fdinfo, -#endif }; /* @@ -997,14 +1006,6 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) return res; } -static void sock_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct socket *sock = f->private_data; - - if (sock->ops->show_fdinfo) - sock->ops->show_fdinfo(m, sock); -} - /* * Atomic setting of ioctl hooks to avoid race * with module unload. -- cgit v1.2.3 From 6181e5cb752e5de9f56fbcee3f0206a2c51f1478 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 2 Jan 2020 21:18:09 +0530 Subject: devlink: add support for reporter recovery completion It is possible that a reporter recovery completion do not finish successfully when recovery is triggered via devlink_health_reporter_recover as recovery could be processed in different context. In such scenario an error is returned by driver when recover hook is invoked and successful recovery completion is intimated later. Expose devlink recover done API to update recovery stats. Signed-off-by: Vikas Gupta Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ net/core/devlink.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 47f87b2fcf63..453f45cc1519 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1000,6 +1000,8 @@ int devlink_health_report(struct devlink_health_reporter *reporter, void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +void +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); bool devlink_is_reload_failed(const struct devlink *devlink); diff --git a/net/core/devlink.c b/net/core/devlink.c index 4c63c9a4c09e..e686ae67cd96 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4860,6 +4860,14 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, } EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); +void +devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) +{ + reporter->recovery_count++; + reporter->last_recovery_ts = jiffies; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); + static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, void *priv_ctx, struct netlink_ext_ack *extack) @@ -4876,9 +4884,8 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter, if (err) return err; - reporter->recovery_count++; + devlink_health_reporter_recovery_done(reporter); reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - reporter->last_recovery_ts = jiffies; return 0; } -- cgit v1.2.3 From 97ff3bd37face9bc1bc824cc08241fc1f860ff46 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 2 Jan 2020 21:18:10 +0530 Subject: devlink: add devink notification when reporter update health state add a devlink notification when reporter update the health state. Signed-off-by: Vikas Gupta Signed-off-by: David S. Miller --- net/core/devlink.c | 59 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index e686ae67cd96..d30aa47052aa 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4843,23 +4843,6 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); -void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) -{ - if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && - state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) - return; - - if (reporter->health_state == state) - return; - - reporter->health_state = state; - trace_devlink_health_reporter_state_update(reporter->devlink, - reporter->ops->name, state); -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); - void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { @@ -5097,6 +5080,48 @@ genlmsg_cancel: return -EMSGSIZE; } +static void devlink_recover_notify(struct devlink_health_reporter *reporter, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_health_reporter_fill(msg, reporter->devlink, + reporter, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(reporter->devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void +devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state state) +{ + if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && + state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) + return; + + if (reporter->health_state == state) + return; + + reporter->health_state = state; + trace_devlink_health_reporter_state_update(reporter->devlink, + reporter->ops->name, state); + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); + static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, struct genl_info *info) { -- cgit v1.2.3 From 4d776482ecc689bdd68627985ac4cb5a6f325953 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jan 2020 21:06:05 -0800 Subject: net: dsa: Get information about stacked DSA protocol It is possible to stack multiple DSA switches in a way that they are not part of the tree (disjoint) but the DSA master of a switch is a DSA slave of another. When that happens switch drivers may have to know this is the case so as to determine whether their tagging protocol has a remove chance of working. This is useful for specific switch drivers such as b53 where devices have been known to be stacked in the wild without the Broadcom tag protocol supporting that feature. This allows b53 to continue supporting those devices by forcing the disabling of Broadcom tags on the outermost switches if necessary. The get_tag_protocol() function is therefore updated to gain an additional enum dsa_tag_protocol argument which denotes the current tagging protocol used by the DSA master we are attached to, else DSA_TAG_PROTO_NONE for the top of the dsa_switch_tree. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 22 ++++++++++++++-------- drivers/net/dsa/b53/b53_priv.h | 4 +++- drivers/net/dsa/dsa_loop.c | 3 ++- drivers/net/dsa/lan9303-core.c | 3 ++- drivers/net/dsa/lantiq_gswip.c | 3 ++- drivers/net/dsa/microchip/ksz8795.c | 3 ++- drivers/net/dsa/microchip/ksz9477.c | 3 ++- drivers/net/dsa/mt7530.c | 3 ++- drivers/net/dsa/mv88e6060.c | 3 ++- drivers/net/dsa/mv88e6xxx/chip.c | 3 ++- drivers/net/dsa/ocelot/felix.c | 3 ++- drivers/net/dsa/qca/ar9331.c | 3 ++- drivers/net/dsa/qca8k.c | 3 ++- drivers/net/dsa/rtl8366rb.c | 3 ++- drivers/net/dsa/sja1105/sja1105_main.c | 3 ++- drivers/net/dsa/vitesse-vsc73xx-core.c | 3 ++- include/net/dsa.h | 3 ++- net/dsa/dsa2.c | 31 +++++++++++++++++++++++++++++-- net/dsa/dsa_priv.h | 1 + net/dsa/slave.c | 4 +--- 20 files changed, 78 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index edacacfc9365..2b530a31ef0f 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -573,9 +573,8 @@ EXPORT_SYMBOL(b53_disable_port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) { - bool tag_en = !(ds->ops->get_tag_protocol(ds, port) == - DSA_TAG_PROTO_NONE); struct b53_device *dev = ds->priv; + bool tag_en = !(dev->tag_protocol == DSA_TAG_PROTO_NONE); u8 hdr_ctl, val; u16 reg; @@ -1876,7 +1875,8 @@ static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port) return ret; } -enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port) +enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mprot) { struct b53_device *dev = ds->priv; @@ -1886,16 +1886,22 @@ enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port) * misses on multicast addresses (TBD). */ if (is5325(dev) || is5365(dev) || is539x(dev) || is531x5(dev) || - !b53_can_enable_brcm_tags(ds, port)) - return DSA_TAG_PROTO_NONE; + !b53_can_enable_brcm_tags(ds, port)) { + dev->tag_protocol = DSA_TAG_PROTO_NONE; + goto out; + } /* Broadcom BCM58xx chips have a flow accelerator on Port 8 * which requires us to use the prepended Broadcom tag type */ - if (dev->chip_id == BCM58XX_DEVICE_ID && port == B53_CPU_PORT) - return DSA_TAG_PROTO_BRCM_PREPEND; + if (dev->chip_id == BCM58XX_DEVICE_ID && port == B53_CPU_PORT) { + dev->tag_protocol = DSA_TAG_PROTO_BRCM_PREPEND; + goto out; + } - return DSA_TAG_PROTO_BRCM; + dev->tag_protocol = DSA_TAG_PROTO_BRCM; +out: + return dev->tag_protocol; } EXPORT_SYMBOL(b53_get_tag_protocol); diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 1877acf05081..3c30f3a7eb29 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -118,6 +118,7 @@ struct b53_device { u8 jumbo_size_reg; int reset_gpio; u8 num_arl_entries; + enum dsa_tag_protocol tag_protocol; /* used ports mask */ u16 enabled_ports; @@ -359,7 +360,8 @@ int b53_mdb_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); int b53_mirror_add(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); -enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port); +enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mprot); void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index c8d7ef27fd72..fdcb70b9f0e4 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -61,7 +61,8 @@ struct dsa_loop_priv { static struct phy_device *phydevs[PHY_MAX_ADDR]; static enum dsa_tag_protocol dsa_loop_get_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { dev_dbg(ds->dev, "%s: port: %d\n", __func__, port); diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index e3c333a8f45d..cc17a44dd3a8 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -883,7 +883,8 @@ static int lan9303_check_device(struct lan9303 *chip) /* ---------------------------- DSA -----------------------------------*/ static enum dsa_tag_protocol lan9303_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_LAN9303; } diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 955324968b74..0369c22fe3e1 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -841,7 +841,8 @@ static int gswip_setup(struct dsa_switch *ds) } static enum dsa_tag_protocol gswip_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_GSWIP; } diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 24a5e99f7fd5..47d65b77caf7 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -645,7 +645,8 @@ static void ksz8795_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val) } static enum dsa_tag_protocol ksz8795_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_KSZ8795; } diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 50ffc63d6231..9a51b8a4de5d 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -295,7 +295,8 @@ static void ksz9477_port_init_cnt(struct ksz_device *dev, int port) } static enum dsa_tag_protocol ksz9477_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { enum dsa_tag_protocol proto = DSA_TAG_PROTO_KSZ9477; struct ksz_device *dev = ds->priv; diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index ed1ec10ec62b..022466ca1c19 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1223,7 +1223,8 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port, } static enum dsa_tag_protocol -mtk_get_tag_protocol(struct dsa_switch *ds, int port) +mtk_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mp) { struct mt7530_priv *priv = ds->priv; diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index a5a37f47b320..24b8219fd607 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -43,7 +43,8 @@ static const char *mv88e6060_get_name(struct mii_bus *bus, int sw_addr) } static enum dsa_tag_protocol mv88e6060_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol m) { return DSA_TAG_PROTO_TRAILER; } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 99816ca9e5e4..04ef4d00f134 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5217,7 +5217,8 @@ static struct mv88e6xxx_chip *mv88e6xxx_alloc_chip(struct device *dev) } static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol m) { struct mv88e6xxx_chip *chip = ds->priv; diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index f072dd75cea2..feccb6201660 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -16,7 +16,8 @@ #include "felix.h" static enum dsa_tag_protocol felix_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_OCELOT; } diff --git a/drivers/net/dsa/qca/ar9331.c b/drivers/net/dsa/qca/ar9331.c index da3bece75e21..de25f99e995a 100644 --- a/drivers/net/dsa/qca/ar9331.c +++ b/drivers/net/dsa/qca/ar9331.c @@ -347,7 +347,8 @@ static void ar9331_sw_port_disable(struct dsa_switch *ds, int port) } static enum dsa_tag_protocol ar9331_sw_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol m) { return DSA_TAG_PROTO_AR9331; } diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index e548289df31e..9f4205b4439b 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1017,7 +1017,8 @@ qca8k_port_fdb_dump(struct dsa_switch *ds, int port, } static enum dsa_tag_protocol -qca8k_get_tag_protocol(struct dsa_switch *ds, int port) +qca8k_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_QCA; } diff --git a/drivers/net/dsa/rtl8366rb.c b/drivers/net/dsa/rtl8366rb.c index f5cc8b0a7c74..fd1977590cb4 100644 --- a/drivers/net/dsa/rtl8366rb.c +++ b/drivers/net/dsa/rtl8366rb.c @@ -964,7 +964,8 @@ static int rtl8366rb_setup(struct dsa_switch *ds) } static enum dsa_tag_protocol rtl8366_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { /* For now, the RTL switches are handled without any custom tags. * diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 61795833c8f5..784e6b8166a0 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1534,7 +1534,8 @@ static int sja1105_setup_8021q_tagging(struct dsa_switch *ds, bool enabled) } static enum dsa_tag_protocol -sja1105_get_tag_protocol(struct dsa_switch *ds, int port) +sja1105_get_tag_protocol(struct dsa_switch *ds, int port, + enum dsa_tag_protocol mp) { return DSA_TAG_PROTO_SJA1105; } diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index 69fc0110ce04..6e21a2a5cf01 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -542,7 +542,8 @@ static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum, } static enum dsa_tag_protocol vsc73xx_get_tag_protocol(struct dsa_switch *ds, - int port) + int port, + enum dsa_tag_protocol mp) { /* The switch internally uses a 8 byte header with length, * source port, tag, LPA and priority. This is supposedly diff --git a/include/net/dsa.h b/include/net/dsa.h index 0c39fed8cd99..63495e3443ac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -380,7 +380,8 @@ typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, - int port); + int port, + enum dsa_tag_protocol mprot); int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c66abbed4daf..c6d81f2baf4e 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -614,6 +614,32 @@ static int dsa_port_parse_dsa(struct dsa_port *dp) return 0; } +static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, + struct net_device *master) +{ + enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE; + struct dsa_switch *mds, *ds = dp->ds; + unsigned int mdp_upstream; + struct dsa_port *mdp; + + /* It is possible to stack DSA switches onto one another when that + * happens the switch driver may want to know if its tagging protocol + * is going to work in such a configuration. + */ + if (dsa_slave_dev_check(master)) { + mdp = dsa_slave_to_port(master); + mds = mdp->ds; + mdp_upstream = dsa_upstream_port(mds, mdp->index); + tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream, + DSA_TAG_PROTO_NONE); + } + + /* If the master device is not itself a DSA slave in a disjoint DSA + * tree, then return immediately. + */ + return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); +} + static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) { struct dsa_switch *ds = dp->ds; @@ -621,20 +647,21 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; - tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); + tag_protocol = dsa_get_tag_protocol(dp, master); tag_ops = dsa_tag_driver_get(tag_protocol); if (IS_ERR(tag_ops)) { if (PTR_ERR(tag_ops) == -ENOPROTOOPT) return -EPROBE_DEFER; dev_warn(ds->dev, "No tagger for this switch\n"); + dp->master = NULL; return PTR_ERR(tag_ops); } + dp->master = master; dp->type = DSA_PORT_TYPE_CPU; dp->filter = tag_ops->filter; dp->rcv = tag_ops->rcv; dp->tag_ops = tag_ops; - dp->master = master; dp->dst = dst; return 0; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 8a162605b861..a7662e7a691d 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -157,6 +157,7 @@ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); +bool dsa_slave_dev_check(const struct net_device *dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c1828bdc79dc..088c886e609e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,8 +22,6 @@ #include "dsa_priv.h" -static bool dsa_slave_dev_check(const struct net_device *dev); - /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -1473,7 +1471,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) free_netdev(slave_dev); } -static bool dsa_slave_dev_check(const struct net_device *dev) +bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; } -- cgit v1.2.3 From a6dd04807ce4d7b71920d996e6b84fa3733a1bfd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jan 2020 08:39:48 +0300 Subject: ethtool: fix a memory leak in ethnl_default_start() If ethnl_default_parse() fails then we need to free a couple memory allocations before returning. Fixes: 728480f12442 ("ethtool: default handlers for GET requests") Signed-off-by: Dan Carpenter Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 4ca96c7b86b3..5d16436498ac 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -472,8 +472,8 @@ static int ethnl_default_start(struct netlink_callback *cb) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { - kfree(req_info); - return -ENOMEM; + ret = -ENOMEM; + goto free_req_info; } ret = ethnl_default_parse(req_info, cb->nlh, sock_net(cb->skb->sk), ops, @@ -487,7 +487,7 @@ static int ethnl_default_start(struct netlink_callback *cb) req_info->dev = NULL; } if (ret < 0) - return ret; + goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; @@ -496,6 +496,13 @@ static int ethnl_default_start(struct netlink_callback *cb) ctx->pos_idx = 0; return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; } /* default ->done() handler for GET requests */ -- cgit v1.2.3 From d97772dbd7727655aa894f1610e983e73678212e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jan 2020 08:41:25 +0300 Subject: ethtool: fix ->reply_size() error handling The "ret < 0" comparison is never true because "ret" is still zero. Fixes: 728480f12442 ("ethtool: default handlers for GET requests") Signed-off-by: Dan Carpenter Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 5d16436498ac..86b79f9bc08d 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -319,9 +319,10 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) rtnl_unlock(); if (ret < 0) goto err_cleanup; - reply_len = ops->reply_size(req_info, reply_data); + ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; + reply_len = ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); @@ -555,9 +556,10 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, ret = ops->prepare_data(req_info, reply_data, NULL); if (ret < 0) goto err_cleanup; - reply_len = ops->reply_size(req_info, reply_data); + ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; + reply_len = ret; ret = -ENOMEM; skb = genlmsg_new(reply_len, GFP_KERNEL); if (!skb) -- cgit v1.2.3 From ac9c41d5a053e71777e8c2191681358499eff04b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jan 2020 08:42:36 +0300 Subject: ethtool: potential NULL dereference in strset_prepare_data() Smatch complains that the NULL checking isn't done consistently: net/ethtool/strset.c:253 strset_prepare_data() error: we previously assumed 'dev' could be null (see line 233) It looks like there is a missing return on this path. Fixes: 71921690f974 ("ethtool: provide string sets with STRSET_GET request") Signed-off-by: Dan Carpenter Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/strset.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 9f2243329015..82a059c13c1c 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -239,6 +239,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, return -EINVAL; } } + return 0; } ret = ethnl_ops_begin(dev); -- cgit v1.2.3 From 0baf26b0fcd74bbfcef53c5d5e8bad2b99c8d0d2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:35:08 -0800 Subject: bpf: tcp: Support tcp_congestion_ops in bpf This patch makes "struct tcp_congestion_ops" to be the first user of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops in bpf. The BPF implemented tcp_congestion_ops can be used like regular kernel tcp-cc through sysctl and setsockopt. e.g. [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic net.ipv4.tcp_congestion_control = bpf_cubic There has been attempt to move the TCP CC to the user space (e.g. CCP in TCP). The common arguments are faster turn around, get away from long-tail kernel versions in production...etc, which are legit points. BPF has been the continuous effort to join both kernel and userspace upsides together (e.g. XDP to gain the performance advantage without bypassing the kernel). The recent BPF advancements (in particular BTF-aware verifier, BPF trampoline, BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc) possible in BPF. It allows a faster turnaround for testing algorithm in the production while leveraging the existing (and continue growing) BPF feature/framework instead of building one specifically for userspace TCP CC. This patch allows write access to a few fields in tcp-sock (in bpf_tcp_ca_btf_struct_access()). The optional "get_info" is unsupported now. It can be added later. One possible way is to output the info with a btf-id to describe the content. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109003508.3856115-1-kafai@fb.com --- include/linux/filter.h | 2 + include/net/tcp.h | 2 + kernel/bpf/bpf_struct_ops_types.h | 7 +- net/core/filter.c | 2 +- net/ipv4/Makefile | 4 + net/ipv4/bpf_tcp_ca.c | 230 ++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_cong.c | 16 +-- net/ipv4/tcp_ipv4.c | 6 +- net/ipv4/tcp_minisocks.c | 4 +- net/ipv4/tcp_output.c | 4 +- 10 files changed, 261 insertions(+), 16 deletions(-) create mode 100644 net/ipv4/bpf_tcp_ca.c (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 70e6dd960bca..a366a0b64a57 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -843,6 +843,8 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 7df37e2fddca..9dd975be7fdf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1007,6 +1007,7 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; @@ -1101,6 +1102,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index 7bb13ff49ec2..066d83ea1c99 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* internal file - do not include directly */ -/* To be filled in a later patch */ +#ifdef CONFIG_BPF_JIT +#ifdef CONFIG_INET +#include +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) +#endif +#endif diff --git a/net/core/filter.c b/net/core/filter.c index 42fd17c48c5f..a702761ef369 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5935,7 +5935,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -static const struct bpf_func_proto * +const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d57ecfaf89d4..9d97bace13c8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o + +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o +endif diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c new file mode 100644 index 000000000000..9c7745b958d7 --- /dev/null +++ b/net/ipv4/bpf_tcp_ca.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include + +static u32 optional_ops[] = { + offsetof(struct tcp_congestion_ops, init), + offsetof(struct tcp_congestion_ops, release), + offsetof(struct tcp_congestion_ops, set_state), + offsetof(struct tcp_congestion_ops, cwnd_event), + offsetof(struct tcp_congestion_ops, in_ack_event), + offsetof(struct tcp_congestion_ops, pkts_acked), + offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, sndbuf_expand), + offsetof(struct tcp_congestion_ops, cong_control), +}; + +static u32 unsupported_ops[] = { + offsetof(struct tcp_congestion_ops, get_info), +}; + +static const struct btf_type *tcp_sock_type; +static u32 tcp_sock_id, sock_id; + +static int bpf_tcp_ca_init(struct btf *btf) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + sock_id = type_id; + + type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_sock_id = type_id; + tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + + return 0; +} + +static bool is_optional(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { + if (member_offset == optional_ops[i]) + return true; + } + + return false; +} + +static bool is_unsupported(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { + if (member_offset == unsupported_ops[i]) + return true; + } + + return false; +} + +extern struct btf *btf_vmlinux; + +static bool bpf_tcp_ca_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + if (!btf_ctx_access(off, size, type, prog, info)) + return false; + + if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) + /* promote it to tcp_sock */ + info->btf_id = tcp_sock_id; + + return true; +} + +static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id) +{ + size_t end; + + if (atype == BPF_READ) + return btf_struct_access(log, t, off, size, atype, next_btf_id); + + if (t != tcp_sock_type) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + switch (off) { + case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): + end = offsetofend(struct inet_connection_sock, icsk_ca_priv); + break; + case offsetof(struct inet_connection_sock, icsk_ack.pending): + end = offsetofend(struct inet_connection_sock, + icsk_ack.pending); + break; + case offsetof(struct tcp_sock, snd_cwnd): + end = offsetofend(struct tcp_sock, snd_cwnd); + break; + case offsetof(struct tcp_sock, snd_cwnd_cnt): + end = offsetofend(struct tcp_sock, snd_cwnd_cnt); + break; + case offsetof(struct tcp_sock, snd_ssthresh): + end = offsetofend(struct tcp_sock, snd_ssthresh); + break; + case offsetof(struct tcp_sock, ecn_flags): + end = offsetofend(struct tcp_sock, ecn_flags); + break; + default: + bpf_log(log, "no write support to tcp_sock at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n", + off, size, end); + return -EACCES; + } + + return NOT_INIT; +} + +static const struct bpf_func_proto * +bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { + .get_func_proto = bpf_tcp_ca_get_func_proto, + .is_valid_access = bpf_tcp_ca_is_valid_access, + .btf_struct_access = bpf_tcp_ca_btf_struct_access, +}; + +static int bpf_tcp_ca_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct tcp_congestion_ops *utcp_ca; + struct tcp_congestion_ops *tcp_ca; + size_t tcp_ca_name_len; + int prog_fd; + u32 moff; + + utcp_ca = (const struct tcp_congestion_ops *)udata; + tcp_ca = (struct tcp_congestion_ops *)kdata; + + moff = btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct tcp_congestion_ops, flags): + if (utcp_ca->flags & ~TCP_CONG_MASK) + return -EINVAL; + tcp_ca->flags = utcp_ca->flags; + return 1; + case offsetof(struct tcp_congestion_ops, name): + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); + if (!tcp_ca_name_len || + tcp_ca_name_len == sizeof(utcp_ca->name)) + return -EINVAL; + if (tcp_ca_find(utcp_ca->name)) + return -EEXIST; + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); + return 1; + } + + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) + return 0; + + /* Ensure bpf_prog is provided for compulsory func ptr */ + prog_fd = (int)(*(unsigned long *)(udata + moff)); + if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) + return -EINVAL; + + return 0; +} + +static int bpf_tcp_ca_check_member(const struct btf_type *t, + const struct btf_member *member) +{ + if (is_unsupported(btf_member_bit_offset(t, member) / 8)) + return -ENOTSUPP; + return 0; +} + +static int bpf_tcp_ca_reg(void *kdata) +{ + return tcp_register_congestion_control(kdata); +} + +static void bpf_tcp_ca_unreg(void *kdata) +{ + tcp_unregister_congestion_control(kdata); +} + +/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + +struct bpf_struct_ops bpf_tcp_congestion_ops = { + .verifier_ops = &bpf_tcp_ca_verifier_ops, + .reg = bpf_tcp_ca_reg, + .unreg = bpf_tcp_ca_unreg, + .check_member = bpf_tcp_ca_check_member, + .init_member = bpf_tcp_ca_init_member, + .init = bpf_tcp_ca_init, + .name = "tcp_congestion_ops", +}; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3737ec096650..3172e31987be 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ -static struct tcp_congestion_ops *tcp_ca_find(const char *name) +struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; @@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk) rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); - if (unlikely(!try_module_get(ca->owner))) + if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); @@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); - module_put(icsk->icsk_ca_ops->owner); + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) - module_put(prev->owner); + bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; @@ -366,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, } else if (!load) { const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (try_module_get(ca->owner)) { + if (bpf_try_module_get(ca, ca->owner)) { if (reinit) { tcp_reinit_congestion_control(sk, ca); } else { icsk->icsk_ca_ops = ca; - module_put(old_ca->owner); + bpf_module_put(old_ca, old_ca->owner); } } else { err = -EBUSY; } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { err = -EBUSY; } else { tcp_reinit_congestion_control(sk, ca); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4adac9c75343..317ccca548a2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2678,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net) int cpu; if (net->ipv4.tcp_congestion_control) - module_put(net->ipv4.tcp_congestion_control->owner); + bpf_module_put(net->ipv4.tcp_congestion_control, + net->ipv4.tcp_congestion_control->owner); for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); @@ -2785,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net) /* Reno is always built in */ if (!net_eq(net, &init_net) && - try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + bpf_try_module_get(init_net.ipv4.tcp_congestion_control, + init_net.ipv4.tcp_congestion_control->owner)) net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; else net->ipv4.tcp_congestion_control = &tcp_reno; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c802bc80c400..ad3b56d9fa71 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -414,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; @@ -425,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || - !try_module_get(icsk->icsk_ca_ops->owner))) + !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 58c92a7d671c..377cfab422df 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3368,8 +3368,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { - module_put(icsk->icsk_ca_ops->owner); + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } -- cgit v1.2.3 From 206057fe020ac5c037d5e2dd6562a9bd216ec765 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jan 2020 16:45:51 -0800 Subject: bpf: Add BPF_FUNC_tcp_send_ack helper Add a helper to send out a tcp-ack. It will be used in the later bpf_dctcp implementation that requires to send out an ack when the CE state changed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200109004551.3900448-1-kafai@fb.com --- include/uapi/linux/bpf.h | 11 ++++++++++- net/ipv4/bpf_tcp_ca.c | 24 +++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 38059880963e..2d6a2e572f56 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2837,6 +2837,14 @@ union bpf_attr { * Return * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. + * + * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2954,7 +2962,8 @@ union bpf_attr { FN(probe_read_user), \ FN(probe_read_kernel), \ FN(probe_read_user_str), \ - FN(probe_read_kernel_str), + FN(probe_read_kernel_str), \ + FN(tcp_send_ack), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 9c7745b958d7..574972bc7299 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -143,11 +143,33 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, return NOT_INIT; } +BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) +{ + /* bpf_tcp_ca prog cannot have NULL tp */ + __tcp_send_ack((struct sock *)tp, rcv_nxt); + return 0; +} + +static const struct bpf_func_proto bpf_tcp_send_ack_proto = { + .func = bpf_tcp_send_ack, + .gpl_only = false, + /* In case we want to report error later */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_ANYTHING, + .btf_id = &tcp_sock_id, +}; + static const struct bpf_func_proto * bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + switch (func_id) { + case BPF_FUNC_tcp_send_ack: + return &bpf_tcp_send_ack_proto; + default: + return bpf_base_func_proto(func_id); + } } static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { -- cgit v1.2.3 From 5e0fcc16e5c563fd8f16738efec974f81e0a5ea0 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Wed, 8 Jan 2020 15:43:40 -0800 Subject: net/ncsi: Support for multi host mellanox card Multi host Mellanox cards require MAC affinity to be set before receiving any config commands. All config commands should also have unicast address for source address in command header. Adding GMA and SMAF(Set Mac Affinity) for Mellanox card and call these in channel probe state machine if it is defined in device tree. Signed-off-by: Vijay Khemka Signed-off-by: David S. Miller --- net/ncsi/internal.h | 20 +++++++++++++++ net/ncsi/ncsi-manage.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index ad3fd7f1da75..e37102546be6 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -64,6 +64,17 @@ enum { NCSI_MODE_MAX }; +/* Supported media status bits for Mellanox Mac affinity command. + * Bit (0-2) for different protocol support; Bit 1 for RBT support, + * bit 1 for SMBUS support and bit 2 for PCIE support. Bit (3-5) + * for different protocol availability. Bit 4 for RBT, bit 4 for + * SMBUS and bit 5 for PCIE. + */ +enum { + MLX_MC_RBT_SUPPORT = 0x01, /* MC supports RBT */ + MLX_MC_RBT_AVL = 0x08, /* RBT medium is available */ +}; + /* OEM Vendor Manufacture ID */ #define NCSI_OEM_MFR_MLX_ID 0x8119 #define NCSI_OEM_MFR_BCM_ID 0x113d @@ -72,9 +83,15 @@ enum { /* Mellanox specific OEM Command */ #define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */ #define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */ +#define NCSI_OEM_MLX_CMD_SMAF 0x01 /* CMD ID for Set MC Affinity */ +#define NCSI_OEM_MLX_CMD_SMAF_PARAM 0x07 /* Parameter for SMAF */ /* OEM Command payload lengths*/ #define NCSI_OEM_BCM_CMD_GMA_LEN 12 #define NCSI_OEM_MLX_CMD_GMA_LEN 8 +#define NCSI_OEM_MLX_CMD_SMAF_LEN 60 +/* Offset in OEM request */ +#define MLX_SMAF_MAC_ADDR_OFFSET 8 /* Offset for MAC in SMAF */ +#define MLX_SMAF_MED_SUPPORT_OFFSET 14 /* Offset for medium in SMAF */ /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 #define MLX_MAC_ADDR_OFFSET 8 @@ -251,6 +268,8 @@ enum { ncsi_dev_state_probe_deselect = 0x0201, ncsi_dev_state_probe_package, ncsi_dev_state_probe_channel, + ncsi_dev_state_probe_mlx_gma, + ncsi_dev_state_probe_mlx_smaf, ncsi_dev_state_probe_cis, ncsi_dev_state_probe_gvi, ncsi_dev_state_probe_gc, @@ -311,6 +330,7 @@ struct ncsi_dev_priv { struct list_head vlan_vids; /* List of active VLAN IDs */ bool multi_package; /* Enable multiple packages */ + bool mlx_multi_host; /* Enable multi host Mellanox */ u32 package_whitelist; /* Packages to configure */ }; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index e20b81514029..1f387be7827b 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include @@ -730,6 +732,34 @@ static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) +{ + union { + u8 data_u8[NCSI_OEM_MLX_CMD_SMAF_LEN]; + u32 data_u32[NCSI_OEM_MLX_CMD_SMAF_LEN / sizeof(u32)]; + } u; + int ret = 0; + + memset(&u, 0, sizeof(u)); + u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF; + u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM; + memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET], + nca->ndp->ndev.dev->dev_addr, ETH_ALEN); + u.data_u8[MLX_SMAF_MED_SUPPORT_OFFSET] = + (MLX_MC_RBT_AVL | MLX_MC_RBT_SUPPORT); + + nca->payload = NCSI_OEM_MLX_CMD_SMAF_LEN; + nca->data = u.data_u8; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during probe\n", + nca->type); + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; @@ -1310,8 +1340,38 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) break; } nd->state = ncsi_dev_state_probe_cis; + if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) && + ndp->mlx_multi_host) + nd->state = ncsi_dev_state_probe_mlx_gma; + schedule_work(&ndp->work); break; +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) + case ncsi_dev_state_probe_mlx_gma: + ndp->pending_req_num = 1; + + nca.type = NCSI_PKT_CMD_OEM; + nca.package = ndp->active_package->id; + nca.channel = 0; + ret = ncsi_oem_gma_handler_mlx(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_mlx_smaf; + break; + case ncsi_dev_state_probe_mlx_smaf: + ndp->pending_req_num = 1; + + nca.type = NCSI_PKT_CMD_OEM; + nca.package = ndp->active_package->id; + nca.channel = 0; + ret = ncsi_oem_smaf_mlx(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_cis; + break; +#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ case ncsi_dev_state_probe_cis: ndp->pending_req_num = NCSI_RESERVED_CHANNEL; @@ -1621,6 +1681,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, { struct ncsi_dev_priv *ndp; struct ncsi_dev *nd; + struct platform_device *pdev; + struct device_node *np; unsigned long flags; int i; @@ -1667,6 +1729,13 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, /* Set up generic netlink interface */ ncsi_init_netlink(dev); + pdev = to_platform_device(dev->dev.parent); + if (pdev) { + np = pdev->dev.of_node; + if (np && of_get_property(np, "mlx,multi-host", NULL)) + ndp->mlx_multi_host = true; + } + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); -- cgit v1.2.3 From 6b3acfc3cc3d54a99cc5148960edfc38c94a93f2 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 9 Jan 2020 08:59:56 +0800 Subject: flow_dissector: fix document for skb_flow_get_icmp_tci using correct input parameter name to fix the below warning: net/core/flow_dissector.c:242: warning: Function parameter or member 'thoff' not described in 'skb_flow_get_icmp_tci' net/core/flow_dissector.c:242: warning: Excess function parameter 'toff' description in 'skb_flow_get_icmp_tci' Signed-off-by: Li RongQing Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2dbbb030fbed..f560b4902060 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -233,7 +233,7 @@ static bool icmp_has_id(u8 type) * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet - * @toff: offset to extract at + * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, -- cgit v1.2.3 From e9cdced78dc20c1592c1fb98ed064943007a46c5 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:14 -0800 Subject: net: Make sock protocol value checks more specific SK_PROTOCOL_MAX is only used in two places, for DECNet and AX.25. The limits have more to do with the those protocol definitions than they do with the data type of sk_protocol, so remove SK_PROTOCOL_MAX and use U8_MAX directly. Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 1 - net/ax25/af_ax25.c | 2 +- net/decnet/af_decnet.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 8dff68b4c316..091e55428415 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -458,7 +458,6 @@ struct sock { sk_userlocks : 4, sk_protocol : 8, sk_type : 16; -#define SK_PROTOCOL_MAX U8_MAX u16 sk_gso_max_segs; u8 sk_pacing_shift; unsigned long sk_lingertime; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 324306d6fde0..ff57ea89c27e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -808,7 +808,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index e19a92a62e14..0a46ea3bddd5 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -670,7 +670,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) -- cgit v1.2.3 From bf9765145b856fa2e238a5b8a54453795ba30ad6 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:15 -0800 Subject: sock: Make sk_protocol a 16-bit value Match the 16-bit width of skbuff->protocol. Fills an 8-bit hole so sizeof(struct sock) does not change. Also take care of BPF field access for sk_type/sk_protocol. Both of them are now outside the bitfield, so we can use load instructions without further shifting/masking. v5 -> v6: - update eBPF accessors, too (Intel's kbuild test robot) v2 -> v3: - keep 'sk_type' 2 bytes aligned (Eric) v1 -> v2: - preserve sk_pacing_shift as bit field (Eric) Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 25 ++++--------------- include/trace/events/sock.h | 2 +- net/core/filter.c | 60 +++++++++++++++++---------------------------- 3 files changed, 28 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 091e55428415..8766f9bc3e70 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,30 +436,15 @@ struct sock { * Because of non atomicity rules, all * changes are protected by socket lock. */ - unsigned int __sk_flags_offset[0]; -#ifdef __BIG_ENDIAN_BITFIELD -#define SK_FL_PROTO_SHIFT 16 -#define SK_FL_PROTO_MASK 0x00ff0000 - -#define SK_FL_TYPE_SHIFT 0 -#define SK_FL_TYPE_MASK 0x0000ffff -#else -#define SK_FL_PROTO_SHIFT 8 -#define SK_FL_PROTO_MASK 0x0000ff00 - -#define SK_FL_TYPE_SHIFT 16 -#define SK_FL_TYPE_MASK 0xffff0000 -#endif - - unsigned int sk_padding : 1, + u8 sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, - sk_userlocks : 4, - sk_protocol : 8, - sk_type : 16; - u16 sk_gso_max_segs; + sk_userlocks : 4; u8 sk_pacing_shift; + u16 sk_type; + u16 sk_protocol; + u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 51fe9f6719eb..3ff12b90048d 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -147,7 +147,7 @@ TRACE_EVENT(inet_sock_set_state, __field(__u16, sport) __field(__u16, dport) __field(__u16, family) - __field(__u8, protocol) + __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) diff --git a/net/core/filter.c b/net/core/filter.c index 42fd17c48c5f..ef01c5599501 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7607,21 +7607,21 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, type): - BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); - *target_size = 2; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_type), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_type, + sizeof_field(struct sock, sk_type), + target_size)); break; case offsetof(struct bpf_sock, protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); - *target_size = 1; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_protocol), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_protocol, + sizeof_field(struct sock, sk_protocol), + target_size)); break; case offsetof(struct bpf_sock, src_ip4): @@ -7903,20 +7903,13 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_addr, type): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): @@ -8835,11 +8828,11 @@ sk_reuseport_is_valid_access(int off, int size, skb, \ SKB_FIELD) -#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ - struct sock, \ - sk, \ - SK_FIELD, BPF_SIZE, EXTRA_OFF) +#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, @@ -8863,16 +8856,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_reuseport_md, ip_protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, - BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); - /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian - * aware. No further narrowing or masking is needed. - */ - *target_size = 1; + SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): -- cgit v1.2.3 From 1323059301c8f36d933876233516245d882346a6 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:18 -0800 Subject: tcp, ulp: Add clone operation to tcp_ulp_ops If ULP is used on a listening socket, icsk_ulp_ops and icsk_ulp_data are copied when the listener is cloned. Sometimes the clone is immediately deleted, which will invoke the release op on the clone and likely corrupt the listening socket's icsk_ulp_data. The clone operation is invoked immediately after the clone is copied and gives the ULP type an opportunity to set up the clone socket and its icsk_ulp_data. The MPTCP ULP clone will silently fallback to plain TCP on allocation failure, so 'clone()' does not need to return an error code. v6 -> v7: - move and rename ulp clone helper to make it inline-friendly v5 -> v6: - clarified MPTCP clone usage in commit message Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +++ net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 85f1d7ff6e8b..ac52633e7061 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2154,6 +2154,9 @@ struct tcp_ulp_ops { /* diagnostic */ int (*get_info)(const struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); + /* clone ulp */ + void (*clone)(const struct request_sock *req, struct sock *newsk, + const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 18c0d5bffe12..6a691fd04398 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -770,6 +770,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + if (!icsk->icsk_ulp_ops) + return; + + if (icsk->icsk_ulp_ops->clone) + icsk->icsk_ulp_ops->clone(req, newsk, priority); +} + /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone @@ -810,6 +822,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + inet_clone_ulp(req, newsk, priority); + security_inet_csk_clone(newsk, req); } return newsk; -- cgit v1.2.3 From 3ee17bc78e0f3fdeff9890993e8f3a9f5145163b Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:19 -0800 Subject: mptcp: Add MPTCP to skb extensions Add enum value for MPTCP and update config dependencies v5 -> v6: - fixed '__unused' field size Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- MAINTAINERS | 10 ++++++++++ include/linux/skbuff.h | 3 +++ include/net/mptcp.h | 28 ++++++++++++++++++++++++++++ net/core/skbuff.c | 7 +++++++ 4 files changed, 48 insertions(+) create mode 100644 include/net/mptcp.h (limited to 'net') diff --git a/MAINTAINERS b/MAINTAINERS index 218089f38e11..9dcb9cab5705 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11573,6 +11573,16 @@ F: net/ipv6/calipso.c F: net/netfilter/xt_CONNSECMARK.c F: net/netfilter/xt_SECMARK.c +NETWORKING [MPTCP] +M: Mat Martineau +M: Matthieu Baerts +L: netdev@vger.kernel.org +L: mptcp@lists.01.org +W: https://github.com/multipath-tcp/mptcp_net-next/wiki +B: https://github.com/multipath-tcp/mptcp_net-next/issues +S: Maintained +F: include/net/mptcp.h + NETWORKING [TCP] M: Eric Dumazet L: netdev@vger.kernel.org diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 64e5b1be9ff5..f5c27600b410 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4096,6 +4096,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, +#endif +#if IS_ENABLED(CONFIG_MPTCP) + SKB_EXT_MPTCP, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/net/mptcp.h b/include/net/mptcp.h new file mode 100644 index 000000000000..326043c29c0a --- /dev/null +++ b/include/net/mptcp.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#ifndef __NET_MPTCP_H +#define __NET_MPTCP_H + +#include + +/* MPTCP sk_buff extension data */ +struct mptcp_ext { + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + __unused:3; + /* one byte hole */ +}; + +#endif /* __NET_MPTCP_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 44b0894d8ae1..a4106da23c34 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -4109,6 +4110,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), #endif +#if IS_ENABLED(CONFIG_MPTCP) + [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4122,6 +4126,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) skb_ext_type_len[TC_SKB_EXT] + +#endif +#if IS_ENABLED(CONFIG_MPTCP) + skb_ext_type_len[SKB_EXT_MPTCP] + #endif 0; } -- cgit v1.2.3 From 85712484110df308215077be6ee21c4e57d7dec2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:20 -0800 Subject: tcp: coalesce/collapse must respect MPTCP extensions Coalesce and collapse of packets carrying MPTCP extensions is allowed when the newer packet has no extension or the extensions carried by both packets are equal. This allows merging of TSO packet trains and even cross-TSO packets, and does not require any additional action when moving data into existing SKBs. v3 -> v4: - allow collapsing, under mptcp_skb_can_collapse() constraint v5 -> v6: - clarify MPTCP skb extensions must always be cleared at allocation time Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 8 ++++++++ net/ipv4/tcp_input.c | 11 +++++++--- net/ipv4/tcp_output.c | 2 +- 4 files changed, 74 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 326043c29c0a..0573ae75c3db 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -8,6 +8,7 @@ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H +#include #include /* MPTCP sk_buff extension data */ @@ -25,4 +26,60 @@ struct mptcp_ext { /* one byte hole */ }; +#ifdef CONFIG_MPTCP + +/* move the skb extension owership, with the assumption that 'to' is + * newly allocated + */ +static inline void mptcp_skb_ext_move(struct sk_buff *to, + struct sk_buff *from) +{ + if (!skb_ext_exist(from, SKB_EXT_MPTCP)) + return; + + if (WARN_ON_ONCE(to->active_extensions)) + skb_ext_put(to); + + to->active_extensions = from->active_extensions; + to->extensions = from->extensions; + from->active_extensions = 0; +} + +static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, + const struct mptcp_ext *from_ext) +{ + /* MPTCP always clears the ext when adding it to the skb, so + * holes do not bother us here + */ + return !from_ext || + (to_ext && from_ext && + !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); +} + +/* check if skbs can be collapsed. + * MPTCP collapse is allowed if neither @to or @from carry an mptcp data + * mapping, or if the extension of @to is the same as @from. + * Collapsing is not possible if @to lacks an extension, but @from carries one. + */ +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), + skb_ext_find(from, SKB_EXT_MPTCP)); +} + +#else + +static inline void mptcp_skb_ext_move(struct sk_buff *to, + const struct sk_buff *from) +{ +} + +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return true; +} + +#endif /* CONFIG_MPTCP */ #endif /* __NET_MPTCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index ac52633e7061..13bc83fab454 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -978,6 +979,13 @@ static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) return likely(!TCP_SKB_CB(skb)->eor); } +static inline bool tcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(tcp_skb_can_collapse_to(to) && + mptcp_skb_can_collapse(to, from)); +} + /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 062b696a5fa1..2914fdf1d543 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1422,7 +1422,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; - if (!tcp_skb_can_collapse_to(prev)) + if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && @@ -4423,6 +4423,9 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; + if (!mptcp_skb_can_collapse(to, from)) + return false; + #ifdef CONFIG_TLS_DEVICE if (from->decrypted != to->decrypted) return false; @@ -4932,7 +4935,7 @@ restart: /* The first skb to collapse is: * - not SYN/FIN and * - bloated or contains data before "start" or - * overlaps to the next one. + * overlaps to the next one and mptcp allow collapsing. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(sk, skb->truesize) > skb->len || @@ -4941,7 +4944,7 @@ restart: break; } - if (n && n != tail && + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -4974,6 +4977,7 @@ restart: else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); + mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4993,6 +4997,7 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || + !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; #ifdef CONFIG_TLS_DEVICE diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 58c92a7d671c..3ce7fe1c4076 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2865,7 +2865,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!tcp_can_collapse(sk, skb)) break; - if (!tcp_skb_can_collapse_to(to)) + if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; -- cgit v1.2.3 From 35b2c32116091ef87a15c604cac363da8322a288 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:21 -0800 Subject: tcp: Export TCP functions and ops struct MPTCP will make use of tcp_send_mss() and tcp_push() when sending data to specific TCP subflows. tcp_request_sock_ipvX_ops and ipvX_specific will be referenced during TCP subflow creation. Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/tcp.h | 8 ++++++++ net/ipv4/tcp.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 6 +++--- 4 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 13bc83fab454..5e4133d09b9d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -330,6 +330,9 @@ int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); +int tcp_send_mss(struct sock *sk, int *size_goal, int flags); +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, + int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); @@ -2011,6 +2014,11 @@ struct tcp_request_sock_ops { enum tcp_synack_type synack_type); }; +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; +#if IS_ENABLED(CONFIG_IPV6) +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; +#endif + #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f09fbc85b108..6711a97de3ce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -690,8 +690,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } -static void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle, int size_goal) +void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -925,7 +925,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return max(size_goal, mss_now); } -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4adac9c75343..fedb537839ec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1426,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 95e4e1e95db2..5b5260103b65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -75,7 +75,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; -static const struct inet_connection_sock_af_ops ipv6_specific; +const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; @@ -819,7 +819,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG @@ -1794,7 +1794,7 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor = tcp_twsk_destructor, }; -static const struct inet_connection_sock_af_ops ipv6_specific = { +const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, -- cgit v1.2.3 From 9cfcca2389d7e07647ee69950f46ab5e6dfe03ac Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:22 -0800 Subject: tcp: Check for filled TCP option space before SACK Update the SACK check to work with zero option space available, a case that's possible with MPTCP but not MD5+TS. Maintained only one conditional branch for insufficient SACK space. v1 -> v2: - Moves the check inside the SACK branch by taking recent SACK fix: 9424e2e7ad93 (tcp: md5: fix potential overestimation of TCP option space) in to account, but modifies it to work in MPTCP scenarios beyond the MD5+TS corner case. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3ce7fe1c4076..05109d0c675b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -754,13 +754,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) + return size; + opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); - if (likely(opts->num_sack_blocks)) - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } return size; -- cgit v1.2.3 From 8b69a803814bb8b14155ea60df83f6d57527e69e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jan 2020 07:59:24 -0800 Subject: skb: add helpers to allocate ext independently from sk_buff Currently we can allocate the extension only after the skb, this change allows the user to do the opposite, will simplify allocation failure handling from MPTCP. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f5c27600b410..016b3c4ab99a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4120,6 +4120,9 @@ struct skb_ext { char data[0] __aligned(8); }; +struct skb_ext *__skb_ext_alloc(void); +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a4106da23c34..48a7029529c9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5987,7 +5987,14 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); } -static struct skb_ext *skb_ext_alloc(void) +/** + * __skb_ext_alloc - allocate a new skb extensions storage + * + * Returns the newly allocated pointer. The pointer can later attached to a + * skb via __skb_ext_set(). + * Note: caller must handle the skb_ext as an opaque data. + */ +struct skb_ext *__skb_ext_alloc(void) { struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); @@ -6027,6 +6034,30 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, return new; } +/** + * __skb_ext_set - attach the specified extension storage to this skb + * @skb: buffer + * @id: extension id + * @ext: extension storage previously allocated via __skb_ext_alloc() + * + * Existing extensions, if any, are cleared. + * + * Returns the pointer to the extension. + */ +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext) +{ + unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); + + skb_ext_put(skb); + newlen = newoff + skb_ext_type_len[id]; + ext->chunks = newlen; + ext->offset[id] = newoff; + skb->extensions = ext; + skb->active_extensions = 1 << id; + return skb_ext_get_ptr(ext, id); +} + /** * skb_ext_add - allocate space for given extension, COW if needed * @skb: buffer @@ -6060,7 +6091,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = skb_ext_alloc(); + new = __skb_ext_alloc(); if (!new) return NULL; } -- cgit v1.2.3 From 4905294162bda43bef65b411cf791005c7e15b28 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 13 Jan 2020 22:39:20 +0100 Subject: netns: Remove __peernet2id_alloc() __peernet2id_alloc() was used for both plain lookups and for netns ID allocations (depending the value of '*alloc'). Let's separate lookups from allocations instead. That is, integrate the lookup code into __peernet2id() and make peernet2id_alloc() responsible for allocating new netns IDs when necessary. This makes it clear that __peernet2id() doesn't modify the idr and prepares the code for lockless lookups. Also, mark the 'net' argument of __peernet2id() as 'const', since we're modifying this line. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/core/net_namespace.c | 55 ++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 39402840025e..05e07d24b45b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -211,16 +211,10 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc - * is set to true, thus the caller knows that the new id must be notified via - * rtnl. - */ -static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) +/* Should be called with nsid_lock held. */ +static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); - bool alloc_it = *alloc; - - *alloc = false; /* Magic value for id 0. */ if (id == NET_ID_ZERO) @@ -228,23 +222,9 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) if (id > 0) return id; - if (alloc_it) { - id = alloc_netid(net, peer, -1); - *alloc = true; - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; - } - return NETNSA_NSID_NOT_ASSIGNED; } -/* should be called with nsid_lock held */ -static int __peernet2id(struct net *net, struct net *peer) -{ - bool no = false; - - return __peernet2id_alloc(net, peer, &no); -} - static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will @@ -252,26 +232,37 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, */ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { - bool alloc = false, alive = false; int id; if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; + spin_lock_bh(&net->nsid_lock); - /* - * When peer is obtained from RCU lists, we may race with + id = __peernet2id(net, peer); + if (id >= 0) { + spin_unlock_bh(&net->nsid_lock); + return id; + } + + /* When peer is obtained from RCU lists, we may race with * its cleanup. Check whether it's alive, and this guarantees * we never hash a peer back to net->netns_ids, after it has * just been idr_remove()'d from there in cleanup_net(). */ - if (maybe_get_net(peer)) - alive = alloc = true; - id = __peernet2id_alloc(net, peer, &alloc); + if (!maybe_get_net(peer)) { + spin_unlock_bh(&net->nsid_lock); + return NETNSA_NSID_NOT_ASSIGNED; + } + + id = alloc_netid(net, peer, -1); spin_unlock_bh(&net->nsid_lock); - if (alloc && id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); - if (alive) - put_net(peer); + + put_net(peer); + if (id < 0) + return NETNSA_NSID_NOT_ASSIGNED; + + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); + return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); -- cgit v1.2.3 From 2dce224f469f060b9998a5a869151ef83c08ce77 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 13 Jan 2020 22:39:22 +0100 Subject: netns: protect netns ID lookups with RCU __peernet2id() can be protected by RCU as it only calls idr_for_each(), which is RCU-safe, and never modifies the nsid table. rtnl_net_dumpid() can also do lockless lookups. It does two nested idr_for_each() calls on nsid tables (one direct call and one indirect call because of rtnl_net_dumpid_one() calling __peernet2id()). The netnsid tables are never updated. Therefore it is safe to not take the nsid_lock and run within an RCU-critical section instead. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/core/net_namespace.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 05e07d24b45b..e7a5ff4966c9 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -211,7 +211,7 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -/* Should be called with nsid_lock held. */ +/* Must be called from RCU-critical section or with nsid_lock held */ static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); @@ -272,9 +272,10 @@ int peernet2id(struct net *net, struct net *peer) { int id; - spin_lock_bh(&net->nsid_lock); + rcu_read_lock(); id = __peernet2id(net, peer); - spin_unlock_bh(&net->nsid_lock); + rcu_read_unlock(); + return id; } EXPORT_SYMBOL(peernet2id); @@ -941,6 +942,7 @@ struct rtnl_net_dump_cb { int s_idx; }; +/* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; @@ -1025,19 +1027,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) goto end; } - spin_lock_bh(&net_cb.tgt_net->nsid_lock); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net) && - !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); - err = -EAGAIN; - goto end; - } + rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); - if (net_cb.fillargs.add_ref && - !net_eq(net_cb.ref_net, net_cb.tgt_net)) - spin_unlock_bh(&net_cb.ref_net->nsid_lock); - spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + rcu_read_unlock(); cb->args[0] = net_cb.idx; end: -- cgit v1.2.3 From 8d7e5dee972f1cde2ba96c621f1541fa36e7d4f4 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 13 Jan 2020 22:39:23 +0100 Subject: netns: don't disable BHs when locking "nsid_lock" When peernet2id() had to lock "nsid_lock" before iterating through the nsid table, we had to disable BHs, because VXLAN can call peernet2id() from the xmit path: vxlan_xmit() -> vxlan_fdb_miss() -> vxlan_fdb_notify() -> __vxlan_fdb_notify() -> vxlan_fdb_info() -> peernet2id(). Now that peernet2id() uses RCU protection, "nsid_lock" isn't used in BH context anymore. Therefore, we can safely use plain spin_lock()/spin_unlock() and let BHs run when holding "nsid_lock". Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/core/net_namespace.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e7a5ff4966c9..6412c1fbfcb5 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -237,10 +237,10 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return id; } @@ -250,12 +250,12 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); put_net(peer); if (id < 0) @@ -520,20 +520,20 @@ static void unhash_nsid(struct net *net, struct net *last) for_each_net(tmp) { int id; - spin_lock_bh(&tmp->nsid_lock); + spin_lock(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); + spin_unlock(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -746,9 +746,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, @@ -757,7 +757,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, } err = alloc_netid(net, peer, nsid); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); -- cgit v1.2.3 From 0a29275b6300f39f78a87f2038bbfe5bdbaeca47 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 10 Jan 2020 09:04:37 +0800 Subject: bpf: Return -EBADRQC for invalid map type in __bpf_tx_xdp_map A negative value should be returned if map->map_type is invalid although that is impossible now, but if we run into such situation in future, then xdpbuff could be leaked. Daniel Borkmann suggested: -EBADRQC should be returned to stay consistent with generic XDP for the tracepoint output and not to be confused with -EOPNOTSUPP from other locations like dev_map_enqueue() when ndo_xdp_xmit is missing and such. Suggested-by: Daniel Borkmann Signed-off-by: Li RongQing Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1578618277-18085-1-git-send-email-lirongqing@baidu.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a702761ef369..d6f58dc4e1c4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3522,7 +3522,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, case BPF_MAP_TYPE_XSKMAP: return __xsk_map_redirect(fwd, xdp); default: - break; + return -EBADRQC; } return 0; } -- cgit v1.2.3 From 1a186c14cedbf3364634f2dfa9302ed31b8fab19 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 13 Jan 2020 18:42:27 -0500 Subject: net: udp: use skb_list_walk_safe helper for gso segments This is a straight-forward conversion case for the new function, iterating over the return value from udp_rcv_segment, which actually is a wrapper around skb_gso_segment. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/ipv4/udp.c | 3 +-- net/ipv6/udp.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 93a355b6b092..208da0917469 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2104,8 +2104,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); - for (skb = segs; skb; skb = next) { - next = skb->next; + skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9fec580c968e..5dc439a391fe 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -690,8 +690,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, false); - for (skb = segs; skb; skb = next) { - next = skb->next; + skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); ret = udpv6_queue_rcv_one_skb(sk, skb); -- cgit v1.2.3 From c3b18e0d925439043b5887c9a6129ff2ffaf58b0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 13 Jan 2020 18:42:28 -0500 Subject: net: xfrm: use skb_list_walk_safe helper for gso segments This is converts xfrm segment iteration to use the new function, keeping the flow of the existing code as intact as possible. One case is very straight-forward, whereas the other case has some more subtle code that likes to peak at ->next and relink skbs. By keeping the variables the same as before, we can upgrade this code with minimal surgery required. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/xfrm/xfrm_device.c | 15 ++++----------- net/xfrm/xfrm_output.c | 9 +++------ 2 files changed, 7 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 189ef15acbbc..50f567a88f45 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -78,7 +78,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int err; unsigned long flags; struct xfrm_state *x; - struct sk_buff *skb2; + struct sk_buff *skb2, *nskb; struct softnet_data *sd; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); @@ -148,11 +148,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - skb2 = skb; - - do { - struct sk_buff *nskb = skb2->next; - + skb_list_walk_safe(skb, skb2, nskb) { esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); @@ -176,14 +172,11 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (!skb) return NULL; - goto skip_push; + continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); - -skip_push: - skb2 = nskb; - } while (skb2); + } return skb; } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index b1db55b50ba1..fafc7aba705f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -533,7 +533,7 @@ static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct sk_buff *segs; + struct sk_buff *segs, *nskb; BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); @@ -544,8 +544,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb if (segs == NULL) return -EINVAL; - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); @@ -555,9 +554,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb kfree_skb_list(nskb); return err; } - - segs = nskb; - } while (segs); + } return 0; } -- cgit v1.2.3 From 2cec4448db38758832c2edad439f99584bb8fa0d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 13 Jan 2020 18:42:29 -0500 Subject: net: openvswitch: use skb_list_walk_safe helper for gso segments This is a straight-forward conversion case for the new function, keeping the flow of the existing code as intact as possible. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e3a37d22539c..659c2a790fe7 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -321,8 +321,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, } /* Queue all of the segments. */ - skb = segs; - do { + skb_list_walk_safe(segs, skb, nskb) { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; @@ -330,17 +329,15 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (err) break; - } while ((skb = skb->next)); + } /* Free all of the segments. */ - skb = segs; - do { - nskb = skb->next; + skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); - } while ((skb = nskb)); + } return err; } -- cgit v1.2.3 From b950d8a5b300ceeb2e530291cddc6b3e547291f0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 13 Jan 2020 18:42:30 -0500 Subject: net: sched: use skb_list_walk_safe helper for gso segments This is a straight-forward conversion case for the new function, keeping the flow of the existing code as intact as possible. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 4 +--- net/sched/sch_tbf.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 90ef7cc79b69..1496e87cd07b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1682,8 +1682,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); @@ -1696,7 +1695,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, slen += segs->len; q->buffer_used += segs->truesize; b->packets++; - segs = nskb; } /* stats */ diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 5f72f3f916a5..2cd94973795c 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -155,8 +155,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); nb = 0; - while (segs) { - nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; @@ -167,7 +166,6 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, } else { nb++; } - segs = nskb; } sch->q.qlen += nb; if (nb > 1) -- cgit v1.2.3 From 88bebdf5b26fe9fdeb36f2152978a3bb478b0d4d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 13 Jan 2020 18:42:31 -0500 Subject: net: ipv4: use skb_list_walk_safe helper for gso segments This is a straight-forward conversion case for the new function, keeping the flow of the existing code as intact as possible. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 14db1e0b8a6e..d84819893db9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -240,8 +240,8 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { + struct sk_buff *segs, *nskb; netdev_features_t features; - struct sk_buff *segs; int ret = 0; /* common case: seglen is <= mtu @@ -272,8 +272,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, consume_skb(skb); - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); @@ -281,8 +280,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, if (err && ret == 0) ret = err; - segs = nskb; - } while (segs); + } return ret; } -- cgit v1.2.3 From 2670ee77c99c76f46c061caf65daa2e2bfb5dcc5 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 13 Jan 2020 18:42:32 -0500 Subject: net: netfilter: use skb_list_walk_safe helper for gso segments This is a straight-forward conversion case for the new function, keeping the flow of the existing code as intact as possible. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/netfilter/nfnetlink_queue.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index feabdfb22920..76535fd9278c 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -778,7 +778,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; - struct sk_buff *skb, *segs; + struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); @@ -815,8 +815,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) goto out_err; queued = 0; err = 0; - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); @@ -824,8 +823,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) queued++; else kfree_skb(segs); - segs = nskb; - } while (segs); + } if (queued) { if (err) /* some segments are already queued */ -- cgit v1.2.3 From 9f3ef3d7026fe0aef5c2d649e21b8c71c17d8fc2 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 13 Jan 2020 18:42:33 -0500 Subject: net: mac80211: use skb_list_walk_safe helper for gso segments This is a conversion case for the new function, keeping the flow of the existing code as intact as possible. We also switch over to using skb_mark_not_on_list instead of a null write to skb->next. Finally, this code appeared to have a memory leak in the case where header building fails before the last gso segment. In that case, the remaining segments are not freed. So this commit also adds the proper kfree_skb_list call for the remainder of the skbs. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/mac80211/tx.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a8a7306a1f56..4bd1faf4f779 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3949,18 +3949,15 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, } } - next = skb; - while (next) { - skb = next; - next = skb->next; - - skb->prev = NULL; - skb->next = NULL; + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, ctrl_flags); - if (IS_ERR(skb)) + if (IS_ERR(skb)) { + kfree_skb_list(next); goto out; + } ieee80211_tx_stats(dev, skb->len); -- cgit v1.2.3 From 7786a1af2a6bceb07860ec720e74714004438834 Mon Sep 17 00:00:00 2001 From: Niu Xilei Date: Tue, 14 Jan 2020 11:12:29 +0800 Subject: pktgen: Allow configuration of IPv6 source address range Pktgen can use only one IPv6 source address from output device or src6 command setting. In pressure test we need create lots of sessions more than 65535. So add src6_min and src6_max command to set the range. Signed-off-by: Niu Xilei Changes since v3: - function set_src_in6_addr use static instead of static inline - precompute min_in6_l,min_in6_h,max_in6_h,max_in6_l in setup time Changes since v2: - reword subject line Changes since v1: - only create IPv6 source address over least significant 64 bit range Signed-off-by: David S. Miller --- net/core/pktgen.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 294bfcf0ce0e..890be1b4877e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -323,6 +323,10 @@ struct pktgen_dev { struct in6_addr max_in6_daddr; struct in6_addr min_in6_saddr; struct in6_addr max_in6_saddr; + u64 max_in6_h; + u64 max_in6_l; + u64 min_in6_h; + u64 min_in6_l; /* If we're doing ranges, random or incremental, then this * defines the min/max for those ranges. @@ -1355,6 +1359,59 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } + if (!strcmp(name, "src6_min")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->min_in6_saddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_saddr); + + memcpy(&pkt_dev->min_in6_h, pkt_dev->min_in6_saddr.s6_addr, 8); + memcpy(&pkt_dev->min_in6_l, pkt_dev->min_in6_saddr.s6_addr + 8, 8); + pkt_dev->min_in6_h = be64_to_cpu(pkt_dev->min_in6_h); + pkt_dev->min_in6_l = be64_to_cpu(pkt_dev->min_in6_l); + + pkt_dev->cur_in6_saddr = pkt_dev->min_in6_saddr; + if (debug) + pr_debug("src6_min set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6_min=%s", buf); + return count; + } + if (!strcmp(name, "src6_max")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->max_in6_saddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_saddr); + + memcpy(&pkt_dev->max_in6_h, pkt_dev->max_in6_saddr.s6_addr, 8); + memcpy(&pkt_dev->max_in6_l, pkt_dev->max_in6_saddr.s6_addr + 8, 8); + pkt_dev->max_in6_h = be64_to_cpu(pkt_dev->max_in6_h); + pkt_dev->max_in6_l = be64_to_cpu(pkt_dev->max_in6_l); + + if (debug) + pr_debug("src6_max set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6_max=%s", buf); + return count; + } if (!strcmp(name, "src6")) { len = strn_len(&user_buffer[i], sizeof(buf) - 1); if (len < 0) @@ -2286,6 +2343,45 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues; } +/* generate ipv6 source addr */ +static void set_src_in6_addr(struct pktgen_dev *pkt_dev) +{ + u64 min6, max6, rand, i; + struct in6_addr addr6; + __be64 addr_l, *t; + + min6 = pkt_dev->min_in6_l; + max6 = pkt_dev->max_in6_l; + + /* only generate source address in least significant 64 bits range + * most significant 64 bits must be equal + */ + if (pkt_dev->max_in6_h != pkt_dev->min_in6_h || min6 >= max6) + return; + + addr6 = pkt_dev->min_in6_saddr; + t = (__be64 *)addr6.s6_addr + 1; + + if (pkt_dev->flags & F_IPSRC_RND) { + do { + prandom_bytes(&rand, sizeof(rand)); + rand = rand % (max6 - min6) + min6; + addr_l = cpu_to_be64(rand); + memcpy(t, &addr_l, 8); + } while (ipv6_addr_loopback(&addr6) || + ipv6_addr_v4mapped(&addr6) || + ipv6_addr_is_multicast(&addr6)); + } else { + addr6 = pkt_dev->cur_in6_saddr; + i = be64_to_cpu(*t); + if (++i > max6) + i = min6; + addr_l = cpu_to_be64(i); + memcpy(t, &addr_l, 8); + } + pkt_dev->cur_in6_saddr = addr6; +} + /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2454,6 +2550,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) } } else { /* IPV6 * */ + set_src_in6_addr(pkt_dev); + if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) { int i; -- cgit v1.2.3 From cb6530b99fafea6c0636c4640bd21301d12cdbc9 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 13 Jan 2020 23:56:59 -0800 Subject: net: qrtr: Move resume-tx transmission to recvmsg The confirm-rx bit is used to implement a per port flow control, in order to make sure that no messages are dropped due to resource exhaustion. Move the resume-tx transmission to recvmsg to only confirm messages as they are consumed by the application. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 60 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 3d24d45be5f4..60f73b44c889 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -362,22 +362,11 @@ static void qrtr_port_put(struct qrtr_sock *ipc); static void qrtr_node_rx_work(struct work_struct *work) { struct qrtr_node *node = container_of(work, struct qrtr_node, work); - struct qrtr_ctrl_pkt *pkt; - struct sockaddr_qrtr dst; - struct sockaddr_qrtr src; struct sk_buff *skb; while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { struct qrtr_sock *ipc; - struct qrtr_cb *cb; - int confirm; - - cb = (struct qrtr_cb *)skb->cb; - src.sq_node = cb->src_node; - src.sq_port = cb->src_port; - dst.sq_node = cb->dst_node; - dst.sq_port = cb->dst_port; - confirm = !!cb->confirm_rx; + struct qrtr_cb *cb = (struct qrtr_cb *)skb->cb; qrtr_node_assign(node, cb->src_node); @@ -390,20 +379,6 @@ static void qrtr_node_rx_work(struct work_struct *work) qrtr_port_put(ipc); } - - if (confirm) { - skb = qrtr_alloc_ctrl_packet(&pkt); - if (!skb) - break; - - pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); - pkt->client.node = cpu_to_le32(dst.sq_node); - pkt->client.port = cpu_to_le32(dst.sq_port); - - if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, - &dst, &src)) - break; - } } } @@ -816,6 +791,34 @@ out_node: return rc; } +static int qrtr_send_resume_tx(struct qrtr_cb *cb) +{ + struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port }; + struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port }; + struct qrtr_ctrl_pkt *pkt; + struct qrtr_node *node; + struct sk_buff *skb; + int ret; + + node = qrtr_node_lookup(remote.sq_node); + if (!node) + return -EINVAL; + + skb = qrtr_alloc_ctrl_packet(&pkt); + if (!skb) + return -ENOMEM; + + pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); + pkt->client.node = cpu_to_le32(cb->dst_node); + pkt->client.port = cpu_to_le32(cb->dst_port); + + ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote); + + qrtr_node_release(node); + + return ret; +} + static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -838,6 +841,7 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, release_sock(sk); return rc; } + cb = (struct qrtr_cb *)skb->cb; copied = skb->len; if (copied > size) { @@ -851,7 +855,6 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, rc = copied; if (addr) { - cb = (struct qrtr_cb *)skb->cb; addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; @@ -859,6 +862,9 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, } out: + if (cb->confirm_rx) + qrtr_send_resume_tx(cb); + skb_free_datagram(sk, skb); release_sock(sk); -- cgit v1.2.3 From 5fdeb0d372ab33b4175043a2a4a1730239a217f1 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 13 Jan 2020 23:57:00 -0800 Subject: net: qrtr: Implement outgoing flow control In order to prevent overconsumption of resources on the remote side QRTR implements a flow control mechanism. The mechanism works by the sender keeping track of the number of outstanding unconfirmed messages that has been transmitted to a particular node/port pair. Upon count reaching a low watermark (L) the confirm_rx bit is set in the outgoing message and when the count reaching a high watermark (H) transmission will be blocked upon the reception of a resume_tx message from the remote, that resets the counter to 0. This guarantees that there will be at most 2H - L messages in flight. Values chosen for L and H are 5 and 10 respectively. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 187 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 60f73b44c889..2b7f462476c0 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -8,6 +8,7 @@ #include #include /* For TIOCINQ/OUTQ */ #include +#include #include @@ -113,6 +114,8 @@ static DEFINE_MUTEX(qrtr_port_lock); * @ep: endpoint * @ref: reference count for node * @nid: node id + * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port + * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue * @work: scheduled work struct for recv work * @item: list item for broadcast list @@ -123,11 +126,29 @@ struct qrtr_node { struct kref ref; unsigned int nid; + struct radix_tree_root qrtr_tx_flow; + struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ + struct sk_buff_head rx_queue; struct work_struct work; struct list_head item; }; +/** + * struct qrtr_tx_flow - tx flow control + * @resume_tx: waiters for a resume tx from the remote + * @pending: number of waiting senders + * @tx_failed: indicates that a message with confirm_rx flag was lost + */ +struct qrtr_tx_flow { + struct wait_queue_head resume_tx; + int pending; + int tx_failed; +}; + +#define QRTR_TX_FLOW_HIGH 10 +#define QRTR_TX_FLOW_LOW 5 + static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); @@ -143,6 +164,8 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); + struct radix_tree_iter iter; + void __rcu **slot; if (node->nid != QRTR_EP_NID_AUTO) radix_tree_delete(&qrtr_nodes, node->nid); @@ -152,6 +175,12 @@ static void __qrtr_node_release(struct kref *kref) cancel_work_sync(&node->work); skb_queue_purge(&node->rx_queue); + + /* Free tx flow counters */ + radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { + radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); + kfree(*slot); + } kfree(node); } @@ -171,6 +200,126 @@ static void qrtr_node_release(struct qrtr_node *node) kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); } +/** + * qrtr_tx_resume() - reset flow control counter + * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on + * @skb: resume_tx packet + */ +static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) +{ + struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data; + u64 remote_node = le32_to_cpu(pkt->client.node); + u32 remote_port = le32_to_cpu(pkt->client.port); + struct qrtr_tx_flow *flow; + unsigned long key; + + key = remote_node << 32 | remote_port; + + rcu_read_lock(); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + rcu_read_unlock(); + if (flow) { + spin_lock(&flow->resume_tx.lock); + flow->pending = 0; + spin_unlock(&flow->resume_tx.lock); + wake_up_interruptible_all(&flow->resume_tx); + } + + consume_skb(skb); +} + +/** + * qrtr_tx_wait() - flow control for outgoing packets + * @node: qrtr_node that the packet is to be send to + * @dest_node: node id of the destination + * @dest_port: port number of the destination + * @type: type of message + * + * The flow control scheme is based around the low and high "watermarks". When + * the low watermark is passed the confirm_rx flag is set on the outgoing + * message, which will trigger the remote to send a control message of the type + * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit + * further transmision should be paused. + * + * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure + */ +static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, + int type) +{ + unsigned long key = (u64)dest_node << 32 | dest_port; + struct qrtr_tx_flow *flow; + int confirm_rx = 0; + int ret; + + /* Never set confirm_rx on non-data packets */ + if (type != QRTR_TYPE_DATA) + return 0; + + mutex_lock(&node->qrtr_tx_lock); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + if (!flow) { + flow = kzalloc(sizeof(*flow), GFP_KERNEL); + if (flow) { + init_waitqueue_head(&flow->resume_tx); + radix_tree_insert(&node->qrtr_tx_flow, key, flow); + } + } + mutex_unlock(&node->qrtr_tx_lock); + + /* Set confirm_rx if we where unable to find and allocate a flow */ + if (!flow) + return 1; + + spin_lock_irq(&flow->resume_tx.lock); + ret = wait_event_interruptible_locked_irq(flow->resume_tx, + flow->pending < QRTR_TX_FLOW_HIGH || + flow->tx_failed || + !node->ep); + if (ret < 0) { + confirm_rx = ret; + } else if (!node->ep) { + confirm_rx = -EPIPE; + } else if (flow->tx_failed) { + flow->tx_failed = 0; + confirm_rx = 1; + } else { + flow->pending++; + confirm_rx = flow->pending == QRTR_TX_FLOW_LOW; + } + spin_unlock_irq(&flow->resume_tx.lock); + + return confirm_rx; +} + +/** + * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed + * @node: qrtr_node that the packet is to be send to + * @dest_node: node id of the destination + * @dest_port: port number of the destination + * + * Signal that the transmission of a message with confirm_rx flag failed. The + * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH, + * at which point transmission would stall forever waiting for the resume TX + * message associated with the dropped confirm_rx message. + * Work around this by marking the flow as having a failed transmission and + * cause the next transmission attempt to be sent with the confirm_rx. + */ +static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, + int dest_port) +{ + unsigned long key = (u64)dest_node << 32 | dest_port; + struct qrtr_tx_flow *flow; + + rcu_read_lock(); + flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + rcu_read_unlock(); + if (flow) { + spin_lock_irq(&flow->resume_tx.lock); + flow->tx_failed = 1; + spin_unlock_irq(&flow->resume_tx.lock); + } +} + /* Pass an outgoing packet socket buffer to the endpoint driver. */ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, @@ -179,6 +328,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc = -ENODEV; + int confirm_rx; + + confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type); + if (confirm_rx < 0) { + kfree_skb(skb); + return confirm_rx; + } hdr = skb_push(skb, sizeof(*hdr)); hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); @@ -194,7 +350,7 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, } hdr->size = cpu_to_le32(len); - hdr->confirm_rx = 0; + hdr->confirm_rx = !!confirm_rx; skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr)); @@ -205,6 +361,11 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, kfree_skb(skb); mutex_unlock(&node->ep_lock); + /* Need to ensure that a subsequent message carries the otherwise lost + * confirm_rx flag if we dropped this one */ + if (rc && confirm_rx) + qrtr_tx_flow_failed(node, to->sq_node, to->sq_port); + return rc; } @@ -311,7 +472,8 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (len != ALIGN(size, 4) + hdrlen) goto err; - if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA) + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && + cb->type != QRTR_TYPE_RESUME_TX) goto err; skb_put_data(skb, data + hdrlen, size); @@ -370,14 +532,18 @@ static void qrtr_node_rx_work(struct work_struct *work) qrtr_node_assign(node, cb->src_node); - ipc = qrtr_port_lookup(cb->dst_port); - if (!ipc) { - kfree_skb(skb); + if (cb->type == QRTR_TYPE_RESUME_TX) { + qrtr_tx_resume(node, skb); } else { - if (sock_queue_rcv_skb(&ipc->sk, skb)) + ipc = qrtr_port_lookup(cb->dst_port); + if (!ipc) { kfree_skb(skb); + } else { + if (sock_queue_rcv_skb(&ipc->sk, skb)) + kfree_skb(skb); - qrtr_port_put(ipc); + qrtr_port_put(ipc); + } } } } @@ -408,6 +574,9 @@ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) node->nid = QRTR_EP_NID_AUTO; node->ep = ep; + INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); + mutex_init(&node->qrtr_tx_lock); + qrtr_node_assign(node, nid); mutex_lock(&qrtr_node_lock); @@ -428,8 +597,11 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; + struct radix_tree_iter iter; struct qrtr_ctrl_pkt *pkt; + struct qrtr_tx_flow *flow; struct sk_buff *skb; + void __rcu **slot; mutex_lock(&node->ep_lock); node->ep = NULL; @@ -442,6 +614,14 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); } + /* Wake up any transmitters waiting for resume-tx from the node */ + mutex_lock(&node->qrtr_tx_lock); + radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { + flow = *slot; + wake_up_interruptible_all(&flow->resume_tx); + } + mutex_unlock(&node->qrtr_tx_lock); + qrtr_node_release(node); ep->node = NULL; } -- cgit v1.2.3 From 0a7e0d0ef05440db03c3199e84d228db943b237f Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 13 Jan 2020 23:57:01 -0800 Subject: net: qrtr: Migrate node lookup tree to spinlock Move operations on the qrtr_nodes radix tree under a separate spinlock and make the qrtr_nodes tree GFP_ATOMIC, to allow operation from atomic context in a subsequent patch. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 2b7f462476c0..ca802798e19e 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -8,6 +8,7 @@ #include #include /* For TIOCINQ/OUTQ */ #include +#include #include #include @@ -98,10 +99,11 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk) static unsigned int qrtr_local_nid = NUMA_NO_NODE; /* for node ids */ -static RADIX_TREE(qrtr_nodes, GFP_KERNEL); +static RADIX_TREE(qrtr_nodes, GFP_ATOMIC); +static DEFINE_SPINLOCK(qrtr_nodes_lock); /* broadcast list */ static LIST_HEAD(qrtr_all_nodes); -/* lock for qrtr_nodes, qrtr_all_nodes and node reference */ +/* lock for qrtr_all_nodes and node reference */ static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ @@ -165,10 +167,13 @@ static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); struct radix_tree_iter iter; + unsigned long flags; void __rcu **slot; + spin_lock_irqsave(&qrtr_nodes_lock, flags); if (node->nid != QRTR_EP_NID_AUTO) radix_tree_delete(&qrtr_nodes, node->nid); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); mutex_unlock(&qrtr_node_lock); @@ -376,11 +381,12 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, static struct qrtr_node *qrtr_node_lookup(unsigned int nid) { struct qrtr_node *node; + unsigned long flags; - mutex_lock(&qrtr_node_lock); + spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); - mutex_unlock(&qrtr_node_lock); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); return node; } @@ -392,13 +398,15 @@ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) */ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { + unsigned long flags; + if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO) return; - mutex_lock(&qrtr_node_lock); + spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); node->nid = nid; - mutex_unlock(&qrtr_node_lock); + spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } /** -- cgit v1.2.3 From f16a4b26f31f95dddb12cf3c2390906a735203ae Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 13 Jan 2020 23:57:02 -0800 Subject: net: qrtr: Make qrtr_port_lookup() use RCU The important part of qrtr_port_lookup() wrt synchronization is that the function returns a reference counted struct qrtr_sock, or fail. As such we need only to ensure that an decrement of the object's refcount happens inbetween the finding of the object in the idr and qrtr_port_lookup()'s own increment of the object. By using RCU and putting a synchronization point after we remove the mapping from the idr, but before it can be released we achieve this - with the benefit of not having to hold the mutex in qrtr_port_lookup(). Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index ca802798e19e..0fe08f060b8c 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -646,11 +646,11 @@ static struct qrtr_sock *qrtr_port_lookup(int port) if (port == QRTR_PORT_CTRL) port = 0; - mutex_lock(&qrtr_port_lock); + rcu_read_lock(); ipc = idr_find(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); - mutex_unlock(&qrtr_port_lock); + rcu_read_unlock(); return ipc; } @@ -692,6 +692,10 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) mutex_lock(&qrtr_port_lock); idr_remove(&qrtr_ports, port); mutex_unlock(&qrtr_port_lock); + + /* Ensure that if qrtr_port_lookup() did enter the RCU read section we + * wait for it to up increment the refcount */ + synchronize_rcu(); } /* Assign port number to socket. -- cgit v1.2.3 From e04df98adf7d7d946aa927822ccec24680501662 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 13 Jan 2020 23:57:03 -0800 Subject: net: qrtr: Remove receive worker Rather than enqueuing messages and scheduling a worker to deliver them to the individual sockets we can now, thanks to the previous work, move this directly into the endpoint callback. This saves us a context switch per incoming message and removes the possibility of an opportunistic suspend to happen between the message is coming from the endpoint until it ends up in the socket's receive buffer. Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 57 +++++++++++++++++---------------------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 0fe08f060b8c..5a8e42ad1504 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -119,7 +119,6 @@ static DEFINE_MUTEX(qrtr_port_lock); * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue - * @work: scheduled work struct for recv work * @item: list item for broadcast list */ struct qrtr_node { @@ -132,7 +131,6 @@ struct qrtr_node { struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ struct sk_buff_head rx_queue; - struct work_struct work; struct list_head item; }; @@ -157,6 +155,8 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); +static struct qrtr_sock *qrtr_port_lookup(int port); +static void qrtr_port_put(struct qrtr_sock *ipc); /* Release node resources and free the node. * @@ -178,7 +178,6 @@ static void __qrtr_node_release(struct kref *kref) list_del(&node->item); mutex_unlock(&qrtr_node_lock); - cancel_work_sync(&node->work); skb_queue_purge(&node->rx_queue); /* Free tx flow counters */ @@ -422,6 +421,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) struct qrtr_node *node = ep->node; const struct qrtr_hdr_v1 *v1; const struct qrtr_hdr_v2 *v2; + struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; unsigned int size; @@ -486,8 +486,20 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) skb_put_data(skb, data + hdrlen, size); - skb_queue_tail(&node->rx_queue, skb); - schedule_work(&node->work); + qrtr_node_assign(node, cb->src_node); + + if (cb->type == QRTR_TYPE_RESUME_TX) { + qrtr_tx_resume(node, skb); + } else { + ipc = qrtr_port_lookup(cb->dst_port); + if (!ipc) + goto err; + + if (sock_queue_rcv_skb(&ipc->sk, skb)) + goto err; + + qrtr_port_put(ipc); + } return 0; @@ -522,40 +534,6 @@ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) return skb; } -static struct qrtr_sock *qrtr_port_lookup(int port); -static void qrtr_port_put(struct qrtr_sock *ipc); - -/* Handle and route a received packet. - * - * This will auto-reply with resume-tx packet as necessary. - */ -static void qrtr_node_rx_work(struct work_struct *work) -{ - struct qrtr_node *node = container_of(work, struct qrtr_node, work); - struct sk_buff *skb; - - while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { - struct qrtr_sock *ipc; - struct qrtr_cb *cb = (struct qrtr_cb *)skb->cb; - - qrtr_node_assign(node, cb->src_node); - - if (cb->type == QRTR_TYPE_RESUME_TX) { - qrtr_tx_resume(node, skb); - } else { - ipc = qrtr_port_lookup(cb->dst_port); - if (!ipc) { - kfree_skb(skb); - } else { - if (sock_queue_rcv_skb(&ipc->sk, skb)) - kfree_skb(skb); - - qrtr_port_put(ipc); - } - } - } -} - /** * qrtr_endpoint_register() - register a new endpoint * @ep: endpoint to register @@ -575,7 +553,6 @@ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) if (!node) return -ENOMEM; - INIT_WORK(&node->work, qrtr_node_rx_work); kref_init(&node->ref); mutex_init(&node->ep_lock); skb_queue_head_init(&node->rx_queue); -- cgit v1.2.3 From 6324d0fa03bf118e9682b6f3da437deaadb61924 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 14 Jan 2020 13:23:09 +0200 Subject: ipv4: Replace route in list before notifying Subsequent patches will add an offload / trap indication to routes which will signal if the route is present in hardware or not. After programming the route to the hardware, drivers will have to ask the IPv4 code to set the flags by passing the route's key. In the case of route replace, the new route is notified before it is actually inserted into the FIB alias list. This can prevent simple drivers (e.g., netdevsim) that program the route to the hardware in the same context it is notified in from being able to set the flag. Solve this by first inserting the new route to the list and rollback the operation in case the route was vetoed. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b92a42433a7d..39f56d68ec19 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1221,23 +1221,26 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); + if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, - tb->tb_id, true) == fa) { + tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); - if (err) + if (err) { + hlist_replace_rcu(&new_fa->fa_list, + &fa->fa_list); goto out_free_new_fa; + } } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); - hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - alias_free_mem_rcu(fa); fib_release_info(fi_drop); -- cgit v1.2.3 From 1e301fd04eaaa5b1e3c202450d86864e6714d783 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 14 Jan 2020 13:23:10 +0200 Subject: ipv4: Encapsulate function arguments in a struct fib_dump_info() is used to prepare RTM_{NEW,DEL}ROUTE netlink messages using the passed arguments. Currently, the function takes 11 arguments, 6 of which are attributes of the route being dumped (e.g., prefix, TOS). The next patch will need the function to also dump to user space an indication if the route is present in hardware or not. Instead of passing yet another argument, change the function to take a struct containing the different route attributes. v2: * Name last argument of fib_dump_info() * Move 'struct fib_rt_info' to include/net/ip_fib.h so that it could later be passed to fib_alias_hw_flags_set() Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 9 +++++++++ net/ipv4/fib_lookup.h | 5 ++--- net/ipv4/fib_semantics.c | 26 ++++++++++++++++---------- net/ipv4/fib_trie.c | 14 +++++++++----- net/ipv4/route.c | 12 +++++++++--- 5 files changed, 45 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b9cba41c6d4f..0c071c820e33 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -204,6 +204,15 @@ __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) +struct fib_rt_info { + struct fib_info *fi; + u32 tb_id; + __be32 dst; + int dst_len; + u8 tos; + u8 type; +}; + struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a68b5e21ec51..a4b829358bfa 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -35,9 +35,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); -int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, - u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, - unsigned int); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f1888c683426..3ed1349be428 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -504,6 +504,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { + struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; @@ -512,9 +513,13 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, if (!skb) goto errout; - err = fib_dump_info(skb, info->portid, seq, event, tb_id, - fa->fa_type, key, dst_len, - fa->fa_tos, fa->fa_info, nlm_flags); + fri.fi = fa->fa_info; + fri.tb_id = tb_id; + fri.dst = key; + fri.dst_len = dst_len; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; + err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1725,10 +1730,11 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, - struct fib_info *fi, unsigned int flags) + struct fib_rt_info *fri, unsigned int flags) { - unsigned int nhs = fib_info_num_path(fi); + unsigned int nhs = fib_info_num_path(fri->fi); + struct fib_info *fi = fri->fi; + u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1738,22 +1744,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; - rtm->rtm_dst_len = dst_len; + rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; - rtm->rtm_tos = tos; + rtm->rtm_tos = fri->tos; if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; - rtm->rtm_type = type; + rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && - nla_put_in_addr(skb, RTA_DST, dst)) + nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 39f56d68ec19..75af3f8ae50e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2194,14 +2194,18 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, if (filter->dump_routes) { if (!s_fa) { + struct fib_rt_info fri; + + fri.fi = fi; + fri.tb_id = tb->tb_id; + fri.dst = xkey; + fri.dst_len = KEYLENGTH - fa->fa_slen; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, fi, flags); + RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 87e979f2b74a..167a7357d12a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3223,16 +3223,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + struct fib_rt_info fri; + if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } + fri.fi = res.fi; + fri.tb_id = table_id; + fri.dst = res.prefix; + fri.dst_len = res.prefixlen; + fri.tos = fl4.flowi4_tos; + fri.type = rt->rt_type; err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, RTM_NEWROUTE, table_id, - rt->rt_type, res.prefix, res.prefixlen, - fl4.flowi4_tos, res.fi, 0); + nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, -- cgit v1.2.3 From 90b93f1b31f86dcde2fa3c57f1ae33d28d87a1f8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 14 Jan 2020 13:23:11 +0200 Subject: ipv4: Add "offload" and "trap" indications to routes When performing L3 offload, routes and nexthops are usually programmed into two different tables in the underlying device. Therefore, the fact that a nexthop resides in hardware does not necessarily mean that all the associated routes also reside in hardware and vice-versa. While the kernel can signal to user space the presence of a nexthop in hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag for routes. In addition, the fact that a route resides in hardware does not necessarily mean that the traffic is offloaded. For example, unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap packets to the CPU so that the kernel will be able to generate the appropriate ICMP error packet. This patch adds an "offload" and "trap" indications to IPv4 routes, so that users will have better visibility into the offload process. 'struct fib_alias' is extended with two new fields that indicate if the route resides in hardware or not and if it is offloading traffic from the kernel or trapping packets to it. Note that the new fields are added in the 6 bytes hole and therefore the struct still fits in a single cache line [1]. Capable drivers are expected to invoke fib_alias_hw_flags_set() with the route's key in order to set the flags. The indications are dumped to user space via a new flags (i.e., 'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the ancillary header. v2: * Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set() [1] struct fib_alias { struct hlist_node fa_list; /* 0 16 */ struct fib_info * fa_info; /* 16 8 */ u8 fa_tos; /* 24 1 */ u8 fa_type; /* 25 1 */ u8 fa_state; /* 26 1 */ u8 fa_slen; /* 27 1 */ u32 tb_id; /* 28 4 */ s16 fa_default; /* 32 2 */ u8 offload:1; /* 34: 0 1 */ u8 trap:1; /* 34: 1 1 */ u8 unused:6; /* 34: 2 1 */ /* XXX 5 bytes hole, try to pack */ struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */ /* size: 56, cachelines: 1, members: 12 */ /* sum members: 50, holes: 1, sum holes: 5 */ /* sum bitfield members: 8 bits (1 bytes) */ /* forced alignments: 1, forced holes: 1, sum forced holes: 5 */ /* last cacheline: 56 bytes */ } __attribute__((__aligned__(8))); Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 4 ++++ include/uapi/linux/rtnetlink.h | 2 ++ net/ipv4/fib_lookup.h | 3 +++ net/ipv4/fib_semantics.c | 7 ++++++ net/ipv4/fib_trie.c | 52 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 19 +++++++++++++++ 6 files changed, 87 insertions(+) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 0c071c820e33..6a1ae49809de 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -211,6 +211,9 @@ struct fib_rt_info { int dst_len; u8 tos; u8 type; + u8 offload:1, + trap:1, + unused:6; }; struct fib_entry_notifier_info { @@ -473,6 +476,7 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *fc_encap, void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 1418a8362bb7..cd43321d20dd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -309,6 +309,8 @@ enum rt_scope_t { #define RTM_F_PREFIX 0x800 /* Prefix addresses */ #define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */ #define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ +#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */ +#define RTM_F_TRAP 0x8000 /* route is trapping packets */ /* Reserved table identifiers */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a4b829358bfa..c092e9a55790 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -16,6 +16,9 @@ struct fib_alias { u8 fa_slen; u32 tb_id; s16 fa_default; + u8 offload:1, + trap:1, + unused:6; struct rcu_head rcu; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3ed1349be428..a803cdd9400a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -519,6 +519,8 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, fri.dst_len = dst_len; fri.tos = fa->fa_tos; fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -1801,6 +1803,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } + if (fri->offload) + rtm->rtm_flags |= RTM_F_OFFLOAD; + if (fri->trap) + rtm->rtm_flags |= RTM_F_TRAP; + nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 75af3f8ae50e..6ce1f2bbffd0 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1012,6 +1012,52 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, return NULL; } +static struct fib_alias * +fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) +{ + u8 slen = KEYLENGTH - fri->dst_len; + struct key_vector *l, *tp; + struct fib_table *tb; + struct fib_alias *fa; + struct trie *t; + + tb = fib_get_table(net, fri->tb_id); + if (!tb) + return NULL; + + t = (struct trie *)tb->tb_data; + l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); + if (!l) + return NULL; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && + fa->fa_tos == fri->tos && fa->fa_info == fri->fi && + fa->fa_type == fri->type) + return fa; + } + + return NULL; +} + +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) +{ + struct fib_alias *fa_match; + + rcu_read_lock(); + + fa_match = fib_find_matching_alias(net, fri); + if (!fa_match) + goto out; + + fa_match->offload = fri->offload; + fa_match->trap = fri->trap; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); + static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) @@ -1220,6 +1266,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); @@ -1278,6 +1326,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); @@ -2202,6 +2252,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, fri.dst_len = KEYLENGTH - fa->fa_slen; fri.tos = fa->fa_tos; fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 167a7357d12a..2010888e68ca 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3237,6 +3237,25 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fri.dst_len = res.prefixlen; fri.tos = fl4.flowi4_tos; fri.type = rt->rt_type; + fri.offload = 0; + fri.trap = 0; + if (res.fa_head) { + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { + u8 slen = 32 - fri.dst_len; + + if (fa->fa_slen == slen && + fa->tb_id == fri.tb_id && + fa->fa_tos == fri.tos && + fa->fa_info == res.fi && + fa->fa_type == fri.type) { + fri.offload = fa->offload; + fri.trap = fa->trap; + break; + } + } + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { -- cgit v1.2.3 From bb3c4ab93e44784c1e958bdbba7824bba40f23cd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 14 Jan 2020 13:23:12 +0200 Subject: ipv6: Add "offload" and "trap" indications to routes In a similar fashion to previous patch, add "offload" and "trap" indication to IPv6 routes. This is done by using two unused bits in 'struct fib6_info' to hold these indications. Capable drivers are expected to set these when processing the various in-kernel route notifications. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 11 ++++++++++- net/ipv6/route.c | 7 +++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index b579faea41e9..fd60a8ac02ee 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -192,7 +192,9 @@ struct fib6_info { dst_nopolicy:1, dst_host:1, fib6_destroying:1, - unused:3; + offload:1, + trap:1, + unused:1; struct rcu_head rcu; struct nexthop *nh; @@ -329,6 +331,13 @@ static inline void fib6_info_release(struct fib6_info *f6i) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } +static inline void fib6_info_hw_flags_set(struct fib6_info *f6i, bool offload, + bool trap) +{ + f6i->offload = offload; + f6i->trap = trap; +} + enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0253b702afb7..4fbdc60b4e07 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5576,6 +5576,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, expires -= jiffies; } + if (!dst) { + if (rt->offload) + rtm->rtm_flags |= RTM_F_OFFLOAD; + if (rt->trap) + rtm->rtm_flags |= RTM_F_TRAP; + } + if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; -- cgit v1.2.3 From 5a46facbbcd454985992b5109185329aebf82a02 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:07 +0200 Subject: net: bridge: vlan: add helpers to check for vlan id/range validity Add helpers to check if a vlan id or range are valid. The range helper must be called when range start or end are detected. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 13 +++---------- net/bridge/br_private.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 60136575aea4..14100e8653e6 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -568,17 +568,13 @@ static int br_process_vlan_info(struct net_bridge *br, bool *changed, struct netlink_ext_ack *extack) { - if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK) + if (!br_vlan_valid_id(vinfo_curr->vid)) return -EINVAL; if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - /* check if we are already processing a range */ - if (*vinfo_last) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last)) return -EINVAL; *vinfo_last = vinfo_curr; - /* don't allow range of pvids */ - if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID) - return -EINVAL; return 0; } @@ -586,10 +582,7 @@ static int br_process_vlan_info(struct net_bridge *br, struct bridge_vlan_info tmp_vinfo; int v, err; - if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END)) - return -EINVAL; - - if (vinfo_curr->vid <= (*vinfo_last)->vid) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last)) return -EINVAL; memcpy(&tmp_vinfo, *vinfo_last, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f540f3bdf294..dbc0089e2c1a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -507,6 +507,37 @@ static inline bool nbp_state_should_learn(const struct net_bridge_port *p) return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; } +static inline bool br_vlan_valid_id(u16 vid) +{ + return vid > 0 && vid < VLAN_VID_MASK; +} + +static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, + const struct bridge_vlan_info *last) +{ + /* pvid flag is not allowed in ranges */ + if (cur->flags & BRIDGE_VLAN_INFO_PVID) + return false; + + /* check for required range flags */ + if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | + BRIDGE_VLAN_INFO_RANGE_END))) + return false; + + /* when cur is the range end, check if: + * - it has range start flag + * - range ids are invalid (end is equal to or before start) + */ + if (last) { + if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) + return false; + else if (cur->vid <= last->vid) + return false; + } + + return true; +} + static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { -- cgit v1.2.3 From 8f4cc940a149b9fe013a191d6d8dc87aee9a204f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:08 +0200 Subject: net: bridge: netlink: add extack error messages when processing vlans Add extack messages on vlan processing errors. We need to move the flags missing check after the "last" check since we may have "last" set but lack a range end flag in the next entry. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 6 +++--- net/bridge/br_private.h | 38 +++++++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 14100e8653e6..40942cece51a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -568,11 +568,11 @@ static int br_process_vlan_info(struct net_bridge *br, bool *changed, struct netlink_ext_ack *extack) { - if (!br_vlan_valid_id(vinfo_curr->vid)) + if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - if (!br_vlan_valid_range(vinfo_curr, *vinfo_last)) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; *vinfo_last = vinfo_curr; return 0; @@ -582,7 +582,7 @@ static int br_process_vlan_info(struct net_bridge *br, struct bridge_vlan_info tmp_vinfo; int v, err; - if (!br_vlan_valid_range(vinfo_curr, *vinfo_last)) + if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; memcpy(&tmp_vinfo, *vinfo_last, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index dbc0089e2c1a..a7dddc5d7790 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -507,32 +507,48 @@ static inline bool nbp_state_should_learn(const struct net_bridge_port *p) return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; } -static inline bool br_vlan_valid_id(u16 vid) +static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack) { - return vid > 0 && vid < VLAN_VID_MASK; + bool ret = vid > 0 && vid < VLAN_VID_MASK; + + if (!ret) + NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid"); + + return ret; } static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, - const struct bridge_vlan_info *last) + const struct bridge_vlan_info *last, + struct netlink_ext_ack *extack) { /* pvid flag is not allowed in ranges */ - if (cur->flags & BRIDGE_VLAN_INFO_PVID) - return false; - - /* check for required range flags */ - if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | - BRIDGE_VLAN_INFO_RANGE_END))) + if (cur->flags & BRIDGE_VLAN_INFO_PVID) { + NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range"); return false; + } /* when cur is the range end, check if: * - it has range start flag * - range ids are invalid (end is equal to or before start) */ if (last) { - if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) + if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one"); return false; - else if (cur->vid <= last->vid) + } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing"); return false; + } else if (cur->vid <= last->vid) { + NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); + return false; + } + } + + /* check for required range flags */ + if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | + BRIDGE_VLAN_INFO_RANGE_END))) { + NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing"); + return false; } return true; -- cgit v1.2.3 From 8dcea187088bce5d2f1149294ad109f022653547 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:09 +0200 Subject: net: bridge: vlan: add rtm definitions and dump support This patch adds vlan rtm definitions: - NEWVLAN: to be used for creating vlans, setting options and notifications - DELVLAN: to be used for deleting vlans - GETVLAN: used for dumping vlan information Dumping vlans which can span multiple messages is added now with basic information (vid and flags). We use nlmsg_parse() to validate the header length in order to be able to extend the message with filtering attributes later. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 28 ++++++++ include/uapi/linux/rtnetlink.h | 7 ++ net/bridge/br_netlink.c | 2 + net/bridge/br_private.h | 14 ++++ net/bridge/br_vlan.c | 148 +++++++++++++++++++++++++++++++++++++++++ security/selinux/nlmsgtab.c | 5 +- 6 files changed, 203 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 4a58e3d7de46..4da04f77d9ee 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -165,6 +165,34 @@ struct bridge_stp_xstats { __u64 tx_tcn; }; +/* Bridge vlan RTM header */ +struct br_vlan_msg { + __u8 family; + __u8 reserved1; + __u16 reserved2; + __u32 ifindex; +}; + +/* Bridge vlan RTM attributes + * [BRIDGE_VLANDB_ENTRY] = { + * [BRIDGE_VLANDB_ENTRY_INFO] + * ... + * } + */ +enum { + BRIDGE_VLANDB_UNSPEC, + BRIDGE_VLANDB_ENTRY, + __BRIDGE_VLANDB_MAX, +}; +#define BRIDGE_VLANDB_MAX (__BRIDGE_VLANDB_MAX - 1) + +enum { + BRIDGE_VLANDB_ENTRY_UNSPEC, + BRIDGE_VLANDB_ENTRY_INFO, + __BRIDGE_VLANDB_ENTRY_MAX, +}; +#define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) + /* Bridge multicast database attributes * [MDBA_MDB] = { * [MDBA_MDB_ENTRY] = { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index cd43321d20dd..b333cff14071 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -171,6 +171,13 @@ enum { RTM_GETLINKPROP, #define RTM_GETLINKPROP RTM_GETLINKPROP + RTM_NEWVLAN = 112, +#define RTM_NEWNVLAN RTM_NEWVLAN + RTM_DELVLAN, +#define RTM_DELVLAN RTM_DELVLAN + RTM_GETVLAN, +#define RTM_GETVLAN RTM_GETVLAN + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 40942cece51a..75a7ecf95d7f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1657,6 +1657,7 @@ int __init br_netlink_init(void) int err; br_mdb_init(); + br_vlan_rtnl_init(); rtnl_af_register(&br_af_ops); err = rtnl_link_register(&br_link_ops); @@ -1674,6 +1675,7 @@ out_af: void br_netlink_fini(void) { br_mdb_uninit(); + br_vlan_rtnl_uninit(); rtnl_af_unregister(&br_af_ops); rtnl_link_unregister(&br_link_ops); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a7dddc5d7790..1c00411ae938 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -958,6 +958,8 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); +void br_vlan_rtnl_init(void); +void br_vlan_rtnl_uninit(void); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -1009,6 +1011,10 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) return vg->pvid; } +static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) +{ + return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags; +} #else static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, @@ -1152,6 +1158,14 @@ static inline int br_vlan_bridge_event(struct net_device *dev, { return 0; } + +static inline void br_vlan_rtnl_init(void) +{ +} + +static inline void br_vlan_rtnl_uninit(void) +{ +} #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index bb98984cd27d..5f2ac4f244f5 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1505,3 +1505,151 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) break; } } + +static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 flags) +{ + struct bridge_vlan_info info; + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); + if (!nest) + return false; + + memset(&info, 0, sizeof(info)); + info.vid = vid; + if (flags & BRIDGE_VLAN_INFO_UNTAGGED) + info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; + if (flags & BRIDGE_VLAN_INFO_PVID) + info.flags |= BRIDGE_VLAN_INFO_PVID; + + if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) + goto out_err; + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + +static int br_vlan_dump_dev(const struct net_device *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net_bridge_vlan_group *vg; + int idx = 0, s_idx = cb->args[1]; + struct nlmsghdr *nlh = NULL; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + struct br_vlan_msg *bvm; + struct net_bridge *br; + int err = 0; + u16 pvid; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) + return -EINVAL; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group_rcu(br); + p = NULL; + } else { + p = br_port_get_rcu(dev); + if (WARN_ON(!p)) + return -EINVAL; + vg = nbp_vlan_group_rcu(p); + br = p->br; + } + + if (!vg) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = PF_BRIDGE; + bvm->ifindex = dev->ifindex; + pvid = br_get_pvid(vg); + + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + if (idx < s_idx) + goto skip; + if (!br_vlan_fill_vids(skb, v->vid, br_vlan_flags(v, pvid))) { + err = -EMSGSIZE; + break; + } +skip: + idx++; + } + if (err) + cb->args[1] = idx; + else + cb->args[1] = 0; + nlmsg_end(skb, nlh); + + return err; +} + +static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx = 0, err = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + struct br_vlan_msg *bvm; + struct net_device *dev; + + err = nlmsg_parse(cb->nlh, sizeof(*bvm), NULL, 0, NULL, cb->extack); + if (err < 0) + return err; + + bvm = nlmsg_data(cb->nlh); + + rcu_read_lock(); + if (bvm->ifindex) { + dev = dev_get_by_index_rcu(net, bvm->ifindex); + if (!dev) { + err = -ENODEV; + goto out_err; + } + err = br_vlan_dump_dev(dev, skb, cb); + if (err && err != -EMSGSIZE) + goto out_err; + } else { + for_each_netdev_rcu(net, dev) { + if (idx < s_idx) + goto skip; + + err = br_vlan_dump_dev(dev, skb, cb); + if (err == -EMSGSIZE) + break; +skip: + idx++; + } + } + cb->args[0] = idx; + rcu_read_unlock(); + + return skb->len; + +out_err: + rcu_read_unlock(); + + return err; +} + +void br_vlan_rtnl_init(void) +{ + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, + br_vlan_rtm_dump, 0); +} + +void br_vlan_rtnl_uninit(void) +{ + rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); +} diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index c97fdae8f71b..b69231918686 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -85,6 +85,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -168,7 +171,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWLINKPROP + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWVLAN + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From f26b296585dca043d78f13ef67cb17fb200d5e57 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:10 +0200 Subject: net: bridge: vlan: add new rtm message support Add initial RTM_NEWVLAN support which can only create vlans, operating similar to the current br_afspec(). We will use it later to also change per-vlan options. Old-style (flag-based) vlan ranges are not allowed when using RTM messages, we will introduce vlan ranges later via a new nested attribute which would allow us to have all the information about a range encapsulated into a single nl attribute. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 12 +++--- net/bridge/br_private.h | 6 +++ net/bridge/br_vlan.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 75a7ecf95d7f..b3da4f46dc64 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -561,12 +561,12 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, return err; } -static int br_process_vlan_info(struct net_bridge *br, - struct net_bridge_port *p, int cmd, - struct bridge_vlan_info *vinfo_curr, - struct bridge_vlan_info **vinfo_last, - bool *changed, - struct netlink_ext_ack *extack) +int br_process_vlan_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct bridge_vlan_info *vinfo_curr, + struct bridge_vlan_info **vinfo_last, + bool *changed, + struct netlink_ext_ack *extack) { if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1c00411ae938..ee3871dea68f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1237,6 +1237,12 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); +int br_process_vlan_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct bridge_vlan_info *vinfo_curr, + struct bridge_vlan_info **vinfo_last, + bool *changed, + struct netlink_ext_ack *extack); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 5f2ac4f244f5..6da0210b01eb 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1643,13 +1643,124 @@ out_err: return err; } +static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { + [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct bridge_vlan_info) }, +}; + +static int br_vlan_rtm_process_one(struct net_device *dev, + const struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct bridge_vlan_info *vinfo, *vinfo_last = NULL; + struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p = NULL; + int err = 0, cmdmap = 0; + struct net_bridge *br; + bool changed = false; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + } else { + p = br_port_get_rtnl(dev); + if (WARN_ON(!p)) + return -ENODEV; + br = p->br; + vg = nbp_vlan_group(p); + } + + if (WARN_ON(!vg)) + return -ENODEV; + + err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, + br_vlan_db_policy, extack); + if (err) + return err; + + if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { + NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); + return -EINVAL; + } + + vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); + if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | + BRIDGE_VLAN_INFO_RANGE_END)) { + NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls"); + return -EINVAL; + } + if (!br_vlan_valid_id(vinfo->vid, extack)) + return -EINVAL; + + switch (cmd) { + case RTM_NEWVLAN: + cmdmap = RTM_SETLINK; + break; + } + + err = br_process_vlan_info(br, p, cmdmap, vinfo, &vinfo_last, &changed, + extack); + if (changed) + br_ifinfo_notify(cmdmap, br, p); + + return err; +} + +static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct br_vlan_msg *bvm; + struct net_device *dev; + struct nlattr *attr; + int err, vlans = 0; + int rem; + + /* this should validate the header and check for remaining bytes */ + err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, + extack); + if (err < 0) + return err; + + bvm = nlmsg_data(nlh); + dev = __dev_get_by_index(net, bvm->ifindex); + if (!dev) + return -ENODEV; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { + NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); + return -EINVAL; + } + + nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { + if (nla_type(attr) != BRIDGE_VLANDB_ENTRY) + continue; + + vlans++; + err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, + extack); + if (err) + break; + } + if (!vlans) { + NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); + err = -EINVAL; + } + + return err; +} + void br_vlan_rtnl_init(void) { rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, + br_vlan_rtm_process, NULL, 0); } void br_vlan_rtnl_uninit(void) { rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); + rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); } -- cgit v1.2.3 From adb3ce9bcb0f565b21e0ce0c75223c6854670b80 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:11 +0200 Subject: net: bridge: vlan: add del rtm message support Adding RTM_DELVLAN support similar to RTM_NEWVLAN is simple, just need to map DELVLAN to DELLINK and register the handler. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6da0210b01eb..89d5fa75c575 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1697,6 +1697,9 @@ static int br_vlan_rtm_process_one(struct net_device *dev, case RTM_NEWVLAN: cmdmap = RTM_SETLINK; break; + case RTM_DELVLAN: + cmdmap = RTM_DELLINK; + break; } err = br_process_vlan_info(br, p, cmdmap, vinfo, &vinfo_last, &changed, @@ -1757,10 +1760,13 @@ void br_vlan_rtnl_init(void) br_vlan_rtm_dump, 0); rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, + br_vlan_rtm_process, NULL, 0); } void br_vlan_rtnl_uninit(void) { rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); + rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); } -- cgit v1.2.3 From 0ab558795184f87b532363e262357160d25f5d64 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:12 +0200 Subject: net: bridge: vlan: add rtm range support Add a new vlandb nl attribute - BRIDGE_VLANDB_ENTRY_RANGE which causes RTM_NEWVLAN/DELVAN to act on a range. Dumps now automatically compress similar vlans into ranges. This will be also used when per-vlan options are introduced and vlans' options match, they will be put into a single range which is encapsulated in one netlink attribute. We need to run similar checks as br_process_vlan_info() does because these ranges will be used for options setting and they'll be able to skip br_process_vlan_info(). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_vlan.c | 86 +++++++++++++++++++++++++++++++++++------- 2 files changed, 73 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 4da04f77d9ee..ac38f0b674b8 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -189,6 +189,7 @@ enum { enum { BRIDGE_VLANDB_ENTRY_UNSPEC, BRIDGE_VLANDB_ENTRY_INFO, + BRIDGE_VLANDB_ENTRY_RANGE, __BRIDGE_VLANDB_ENTRY_MAX, }; #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 89d5fa75c575..9d64a86f2cbd 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1506,7 +1506,8 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) } } -static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 flags) +static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, + u16 flags) { struct bridge_vlan_info info; struct nlattr *nest; @@ -1525,6 +1526,11 @@ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 flags) if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) goto out_err; + if (vid_range && vid < vid_range && + !(flags & BRIDGE_VLAN_INFO_PVID) && + nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) + goto out_err; + nla_nest_end(skb, nest); return true; @@ -1534,14 +1540,22 @@ out_err: return false; } +/* check if v_curr can enter a range ending in range_end */ +static bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return v_curr->vid - range_end->vid == 1 && + range_end->flags == v_curr->flags; +} + static int br_vlan_dump_dev(const struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { + struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; struct nlmsghdr *nlh = NULL; - struct net_bridge_vlan *v; struct net_bridge_port *p; struct br_vlan_msg *bvm; struct net_bridge *br; @@ -1576,22 +1590,49 @@ static int br_vlan_dump_dev(const struct net_device *dev, bvm->ifindex = dev->ifindex; pvid = br_get_pvid(vg); + /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - if (idx < s_idx) - goto skip; - if (!br_vlan_fill_vids(skb, v->vid, br_vlan_flags(v, pvid))) { - err = -EMSGSIZE; - break; + if (idx < s_idx) { + idx++; + continue; } -skip: - idx++; + + if (!range_start) { + range_start = v; + range_end = v; + continue; + } + + if (v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) { + u16 flags = br_vlan_flags(range_start, pvid); + + if (!br_vlan_fill_vids(skb, range_start->vid, + range_end->vid, flags)) { + err = -EMSGSIZE; + break; + } + /* advance number of filled vlans */ + idx += range_end->vid - range_start->vid + 1; + + range_start = v; + } + range_end = v; } - if (err) - cb->args[1] = idx; - else - cb->args[1] = 0; + + /* err will be 0 and range_start will be set in 3 cases here: + * - first vlan (range_start == range_end) + * - last vlan (range_start == range_end, not in range) + * - last vlan range (range_start != range_end, in range) + */ + if (!err && range_start && + !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, + br_vlan_flags(range_start, pvid))) + err = -EMSGSIZE; + + cb->args[1] = err ? idx : 0; + nlmsg_end(skb, nlh); return err; @@ -1646,13 +1687,14 @@ out_err: static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN, .len = sizeof(struct bridge_vlan_info) }, + [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { - struct bridge_vlan_info *vinfo, *vinfo_last = NULL; + struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; @@ -1683,6 +1725,7 @@ static int br_vlan_rtm_process_one(struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); return -EINVAL; } + memset(&vrange_end, 0, sizeof(vrange_end)); vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | @@ -1693,6 +1736,21 @@ static int br_vlan_rtm_process_one(struct net_device *dev, if (!br_vlan_valid_id(vinfo->vid, extack)) return -EINVAL; + if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { + vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); + /* validate user-provided flags without RANGE_BEGIN */ + vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; + vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + + /* vinfo_last is the range start, vinfo the range end */ + vinfo_last = vinfo; + vinfo = &vrange_end; + + if (!br_vlan_valid_id(vinfo->vid, extack) || + !br_vlan_valid_range(vinfo, vinfo_last, extack)) + return -EINVAL; + } + switch (cmd) { case RTM_NEWVLAN: cmdmap = RTM_SETLINK; -- cgit v1.2.3 From cf5bddb95cbe5340e486e84d46cf2f0bb324078c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:13 +0200 Subject: net: bridge: vlan: add rtnetlink group and notify support Add a new rtnetlink group for bridge vlan notifications - RTNLGRP_BRVLAN and add support for sending vlan notifications (both single and ranges). No functional changes intended, the notification support will be used by later patches. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ net/bridge/br_private.h | 11 ++++++ net/bridge/br_vlan.c | 79 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index b333cff14071..4a8c5b745157 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -730,6 +730,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R RTNLGRP_NEXTHOP, #define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP + RTNLGRP_BRVLAN, +#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ee3871dea68f..ba162c8197da 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -960,6 +960,10 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); void br_vlan_rtnl_init(void); void br_vlan_rtnl_uninit(void); +void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -1166,6 +1170,13 @@ static inline void br_vlan_rtnl_init(void) static inline void br_vlan_rtnl_uninit(void) { } + +static inline void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd) +{ +} #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 9d64a86f2cbd..5d52a2604547 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1540,6 +1540,85 @@ out_err: return false; } +static size_t rtnl_vlan_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ + + nla_total_size(sizeof(struct bridge_vlan_info)); /* BRIDGE_VLANDB_ENTRY_INFO */ +} + +void br_vlan_notify(const struct net_bridge *br, + const struct net_bridge_port *p, + u16 vid, u16 vid_range, + int cmd) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct br_vlan_msg *bvm; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err = -ENOBUFS; + struct net *net; + u16 flags = 0; + int ifindex; + + /* right now notifications are done only with rtnl held */ + ASSERT_RTNL(); + + if (p) { + ifindex = p->dev->ifindex; + vg = nbp_vlan_group(p); + net = dev_net(p->dev); + } else { + ifindex = br->dev->ifindex; + vg = br_vlan_group(br); + net = dev_net(br->dev); + } + + skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out_err; + + err = -EMSGSIZE; + nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); + if (!nlh) + goto out_err; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = AF_BRIDGE; + bvm->ifindex = ifindex; + + switch (cmd) { + case RTM_NEWVLAN: + /* need to find the vlan due to flags/options */ + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) + goto out_kfree; + + flags = v->flags; + if (br_get_pvid(vg) == v->vid) + flags |= BRIDGE_VLAN_INFO_PVID; + break; + case RTM_DELVLAN: + break; + default: + goto out_kfree; + } + + if (!br_vlan_fill_vids(skb, vid, vid_range, flags)) + goto out_err; + + nlmsg_end(skb, nlh); + rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); + return; + +out_err: + rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); +out_kfree: + kfree_skb(skb); +} + /* check if v_curr can enter a range ending in range_end */ static bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) -- cgit v1.2.3 From f545923b4a6bd7abedac50ea3fce76c713be1b74 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 14 Jan 2020 19:56:14 +0200 Subject: net: bridge: vlan: notify on vlan add/delete/change flags Now that we can notify, send a notification on add/del or change of flags. Notifications are also compressed when possible to reduce their number and relieve user-space of extra processing, due to that we have to manually notify after each add/del in order to avoid double notifications. We try hard to notify only about the vlans which actually changed, thus a single command can result in multiple notifications about disjoint ranges if there were vlans which didn't change inside. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 34 ++++++++++++++++++++--- net/bridge/br_private.h | 12 +++++++++ net/bridge/br_vlan.c | 71 ++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 99 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index b3da4f46dc64..43dab4066f91 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -568,9 +568,14 @@ int br_process_vlan_info(struct net_bridge *br, bool *changed, struct netlink_ext_ack *extack) { + int err, rtm_cmd; + if (!br_vlan_valid_id(vinfo_curr->vid, extack)) return -EINVAL; + /* needed for vlan-only NEWVLAN/DELVLAN notifications */ + rtm_cmd = br_afspec_cmd_to_rtm(cmd); + if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; @@ -580,7 +585,7 @@ int br_process_vlan_info(struct net_bridge *br, if (*vinfo_last) { struct bridge_vlan_info tmp_vinfo; - int v, err; + int v, v_change_start = 0; if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack)) return -EINVAL; @@ -588,18 +593,41 @@ int br_process_vlan_info(struct net_bridge *br, memcpy(&tmp_vinfo, *vinfo_last, sizeof(struct bridge_vlan_info)); for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { + bool curr_change = false; + tmp_vinfo.vid = v; - err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed, + err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change, extack); if (err) break; + if (curr_change) { + *changed = curr_change; + if (!v_change_start) + v_change_start = v; + } else { + /* nothing to notify yet */ + if (!v_change_start) + continue; + br_vlan_notify(br, p, v_change_start, + v - 1, rtm_cmd); + v_change_start = 0; + } } + /* v_change_start is set only if the last/whole range changed */ + if (v_change_start) + br_vlan_notify(br, p, v_change_start, + v - 1, rtm_cmd); + *vinfo_last = NULL; return err; } - return br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); + err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack); + if (*changed) + br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd); + + return err; } static int br_afspec(struct net_bridge *br, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ba162c8197da..a6226ff2f0cc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -554,6 +554,18 @@ static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, return true; } +static inline int br_afspec_cmd_to_rtm(int cmd) +{ + switch (cmd) { + case RTM_SETLINK: + return RTM_NEWVLAN; + case RTM_DELLINK: + return RTM_DELVLAN; + } + + return 0; +} + static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 5d52a2604547..e4f7dd10c3f8 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -257,6 +257,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, &changed, extack); if (err) goto out_filt; + + if (changed) + br_vlan_notify(br, NULL, v->vid, 0, + RTM_NEWVLAN); } masterv = br_vlan_get_master(br, v->vid, extack); @@ -380,13 +384,31 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg) kfree(vg); } -static void __vlan_flush(struct net_bridge_vlan_group *vg) +static void __vlan_flush(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; + u16 v_start = 0, v_end = 0; __vlan_delete_pvid(vg, vg->pvid); - list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) + list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { + /* take care of disjoint ranges */ + if (!v_start) { + v_start = vlan->vid; + } else if (vlan->vid - v_end != 1) { + /* found range end, notify and start next one */ + br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); + v_start = vlan->vid; + } + v_end = vlan->vid; + __vlan_del(vlan); + } + + /* notify about the last/whole vlan range */ + if (v_start) + br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); } struct sk_buff *br_handle_vlan(struct net_bridge *br, @@ -716,7 +738,7 @@ void br_vlan_flush(struct net_bridge *br) ASSERT_RTNL(); vg = br_vlan_group(br); - __vlan_flush(vg); + __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); @@ -925,12 +947,15 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br) /* Disable default_pvid on all ports where it is still * configured. */ - if (vlan_default_pvid(br_vlan_group(br), pvid)) - br_vlan_delete(br, pvid); + if (vlan_default_pvid(br_vlan_group(br), pvid)) { + if (!br_vlan_delete(br, pvid)) + br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); + } list_for_each_entry(p, &br->port_list, list) { - if (vlan_default_pvid(nbp_vlan_group(p), pvid)) - nbp_vlan_delete(p, pvid); + if (vlan_default_pvid(nbp_vlan_group(p), pvid) && + !nbp_vlan_delete(p, pvid)) + br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } br->default_pvid = 0; @@ -972,7 +997,10 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, &vlchange, extack); if (err) goto out; - br_vlan_delete(br, old_pvid); + + if (br_vlan_delete(br, old_pvid)) + br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); + br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); set_bit(0, changed); } @@ -992,7 +1020,9 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, &vlchange, extack); if (err) goto err_port; - nbp_vlan_delete(p, old_pvid); + if (nbp_vlan_delete(p, old_pvid)) + br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); + br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); set_bit(p->port_no, changed); } @@ -1007,22 +1037,28 @@ err_port: if (!test_bit(p->port_no, changed)) continue; - if (old_pvid) + if (old_pvid) { nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, NULL); + br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); + } nbp_vlan_delete(p, pvid); + br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } if (test_bit(0, changed)) { - if (old_pvid) + if (old_pvid) { br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, NULL); + br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); + } br_vlan_delete(br, pvid); + br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } goto out; } @@ -1115,6 +1151,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) &changed, extack); if (ret) goto err_vlan_add; + br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); } out: return ret; @@ -1196,7 +1233,7 @@ void nbp_vlan_flush(struct net_bridge_port *port) ASSERT_RTNL(); vg = nbp_vlan_group(port); - __vlan_flush(vg); + __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); @@ -1462,8 +1499,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_changeupper_info *info; struct net_bridge *br = netdev_priv(dev); - bool changed; - int ret = 0; + int vlcmd = 0, ret = 0; + bool changed = false; switch (event) { case NETDEV_REGISTER: @@ -1471,9 +1508,11 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); + vlcmd = RTM_NEWVLAN; break; case NETDEV_UNREGISTER: - br_vlan_delete(br, br->default_pvid); + changed = !br_vlan_delete(br, br->default_pvid); + vlcmd = RTM_DELVLAN; break; case NETDEV_CHANGEUPPER: info = ptr; @@ -1487,6 +1526,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) br_vlan_link_state_change(dev, br); break; } + if (changed) + br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); return ret; } -- cgit v1.2.3 From d3a56931f9c8644a52e7a6735954c84ea895e8c3 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 14 Jan 2020 10:49:25 +0100 Subject: xsk: Support allocations of large umems When registering a umem area that is sufficiently large (>1G on an x86), kmalloc cannot be used to allocate one of the internal data structures, as the size requested gets too large. Use kvmalloc instead that falls back on vmalloc if the allocation is too large for kmalloc. Also add accounting for this structure as it is triggered by a user space action (the XDP_UMEM_REG setsockopt) and it is by far the largest structure of kernel allocated memory in xsk. Reported-by: Ryan Goodfellow Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1578995365-7050-1-git-send-email-magnus.karlsson@intel.com --- net/xdp/xdp_umem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 3049af269fbf..f93e917e0929 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -249,7 +249,7 @@ static void xdp_umem_release(struct xdp_umem *umem) xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); - kfree(umem->pages); + kvfree(umem->pages); umem->pages = NULL; xdp_umem_unaccount_pages(umem); @@ -409,7 +409,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); + umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), + GFP_KERNEL_ACCOUNT); if (!umem->pages) { err = -ENOMEM; goto out_pin; @@ -419,7 +420,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (!err) return 0; - kfree(umem->pages); + kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); -- cgit v1.2.3 From 600a87490ff9823d065fc15e86c709e707033ecc Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Tue, 7 Jan 2020 00:43:17 +0000 Subject: Bluetooth: Implementation of MGMT_OP_SET_BLOCKED_KEYS. MGMT command is added to receive the list of blocked keys from user-space. The list is used to: 1) Block keys from being distributed by the device during the ke distribution phase of SMP. 2) Filter out any keys that were previously saved so they are no longer used. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 10 +++++ include/net/bluetooth/mgmt.h | 17 ++++++++ net/bluetooth/hci_core.c | 85 ++++++++++++++++++++++++++++++++++++---- net/bluetooth/hci_debugfs.c | 17 ++++++++ net/bluetooth/mgmt.c | 76 +++++++++++++++++++++++++++++++++++ net/bluetooth/smp.c | 18 +++++++++ 6 files changed, 215 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index faebe3859931..89ecf0a80aa1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -118,6 +118,13 @@ struct bt_uuid { u8 svc_hint; }; +struct blocked_key { + struct list_head list; + struct rcu_head rcu; + u8 type; + u8 val[16]; +}; + struct smp_csrk { bdaddr_t bdaddr; u8 bdaddr_type; @@ -397,6 +404,7 @@ struct hci_dev { struct list_head le_conn_params; struct list_head pend_le_conns; struct list_head pend_le_reports; + struct list_head blocked_keys; struct hci_dev_stats stat; @@ -1123,6 +1131,8 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa); void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type); +bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]); +void hci_blocked_keys_clear(struct hci_dev *hdev); void hci_smp_irks_clear(struct hci_dev *hdev); bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 9cee7ddc6741..a90666af05bd 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -654,6 +654,23 @@ struct mgmt_cp_set_phy_confguration { } __packed; #define MGMT_SET_PHY_CONFIGURATION_SIZE 4 +#define MGMT_OP_SET_BLOCKED_KEYS 0x0046 + +#define HCI_BLOCKED_KEY_TYPE_LINKKEY 0x00 +#define HCI_BLOCKED_KEY_TYPE_LTK 0x01 +#define HCI_BLOCKED_KEY_TYPE_IRK 0x02 + +struct mgmt_blocked_key_info { + __u8 type; + __u8 val[16]; +} __packed; + +struct mgmt_cp_set_blocked_keys { + __le16 key_count; + struct mgmt_blocked_key_info keys[0]; +} __packed; +#define MGMT_OP_SET_BLOCKED_KEYS_SIZE 2 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 9e19d5a3aac8..f0298db26dc3 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2311,6 +2311,33 @@ void hci_smp_irks_clear(struct hci_dev *hdev) } } +void hci_blocked_keys_clear(struct hci_dev *hdev) +{ + struct blocked_key *b; + + list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { + list_del_rcu(&b->list); + kfree_rcu(b, rcu); + } +} + +bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) +{ + bool blocked = false; + struct blocked_key *b; + + rcu_read_lock(); + list_for_each_entry(b, &hdev->blocked_keys, list) { + if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { + blocked = true; + break; + } + } + + rcu_read_unlock(); + return blocked; +} + struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *k; @@ -2319,6 +2346,16 @@ struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) list_for_each_entry_rcu(k, &hdev->link_keys, list) { if (bacmp(bdaddr, &k->bdaddr) == 0) { rcu_read_unlock(); + + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_LINKKEY, + k->val)) { + bt_dev_warn_ratelimited(hdev, + "Link key blocked for %pMR", + &k->bdaddr); + return NULL; + } + return k; } } @@ -2387,6 +2424,15 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) { rcu_read_unlock(); + + if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, + k->val)) { + bt_dev_warn_ratelimited(hdev, + "LTK blocked for %pMR", + &k->bdaddr); + return NULL; + } + return k; } } @@ -2397,31 +2443,42 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) { + struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (!bacmp(&irk->rpa, rpa)) { - rcu_read_unlock(); - return irk; + irk_to_return = irk; + goto done; } } list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); - rcu_read_unlock(); - return irk; + irk_to_return = irk; + goto done; } } + +done: + if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, + irk_to_return->val)) { + bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", + &irk_to_return->bdaddr); + irk_to_return = NULL; + } + rcu_read_unlock(); - return NULL; + return irk_to_return; } struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { + struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; /* Identity Address must be public or static random */ @@ -2432,13 +2489,23 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { - rcu_read_unlock(); - return irk; + irk_to_return = irk; + goto done; } } + +done: + + if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, + irk_to_return->val)) { + bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", + &irk_to_return->bdaddr); + irk_to_return = NULL; + } + rcu_read_unlock(); - return NULL; + return irk_to_return; } struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, @@ -3244,6 +3311,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); + INIT_LIST_HEAD(&hdev->blocked_keys); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); @@ -3443,6 +3511,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); + hci_blocked_keys_clear(hdev); hci_dev_unlock(hdev); hci_dev_put(hdev); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 402e2cc54044..1c8100bc4e04 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -152,6 +152,21 @@ static int blacklist_show(struct seq_file *f, void *p) DEFINE_SHOW_ATTRIBUTE(blacklist); +static int blocked_keys_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + struct blocked_key *key; + + rcu_read_lock(); + list_for_each_entry_rcu(key, &hdev->blocked_keys, list) + seq_printf(f, "%u %*phN\n", key->type, 16, key->val); + rcu_read_unlock(); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(blocked_keys); + static int uuids_show(struct seq_file *f, void *p) { struct hci_dev *hdev = f->private; @@ -308,6 +323,8 @@ void hci_debugfs_create_common(struct hci_dev *hdev) &device_list_fops); debugfs_create_file("blacklist", 0444, hdev->debugfs, hdev, &blacklist_fops); + debugfs_create_file("blocked_keys", 0444, hdev->debugfs, hdev, + &blocked_keys_fops); debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops); debugfs_create_file("remote_oob", 0400, hdev->debugfs, hdev, &remote_oob_fops); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index acb7c6d5643f..339c762eb6fd 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -106,6 +106,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_START_LIMITED_DISCOVERY, MGMT_OP_READ_EXT_INFO, MGMT_OP_SET_APPEARANCE, + MGMT_OP_SET_BLOCKED_KEYS, }; static const u16 mgmt_events[] = { @@ -2341,6 +2342,14 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_LINKKEY, + key->val)) { + bt_dev_warn(hdev, "Skipping blocked link key for %pMR", + &key->addr.bdaddr); + continue; + } + /* Always ignore debug keys and require a new pairing if * the user wants to use them. */ @@ -3531,6 +3540,55 @@ unlock: return err; } +static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + int err = MGMT_STATUS_SUCCESS; + struct mgmt_cp_set_blocked_keys *keys = data; + const u16 max_key_count = ((U16_MAX - sizeof(*keys)) / + sizeof(struct mgmt_blocked_key_info)); + u16 key_count, expected_len; + int i; + + BT_DBG("request for %s", hdev->name); + + key_count = __le16_to_cpu(keys->key_count); + if (key_count > max_key_count) { + bt_dev_err(hdev, "too big key_count value %u", key_count); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } + + expected_len = struct_size(keys, keys, key_count); + if (expected_len != len) { + bt_dev_err(hdev, "expected %u bytes, got %u bytes", + expected_len, len); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } + + hci_dev_lock(hdev); + + hci_blocked_keys_clear(hdev); + + for (i = 0; i < keys->key_count; ++i) { + struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL); + + if (!b) { + err = MGMT_STATUS_NO_RESOURCES; + break; + } + + b->type = keys->keys[i].type; + memcpy(b->val, keys->keys[i].val, sizeof(b->val)); + list_add_rcu(&b->list, &hdev->blocked_keys); + } + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, + err, NULL, 0); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -5051,6 +5109,14 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *irk = &cp->irks[i]; + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_IRK, + irk->val)) { + bt_dev_warn(hdev, "Skipping blocked IRK for %pMR", + &irk->addr.bdaddr); + continue; + } + hci_add_irk(hdev, &irk->addr.bdaddr, le_addr_type(irk->addr.type), irk->val, BDADDR_ANY); @@ -5134,6 +5200,14 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, struct mgmt_ltk_info *key = &cp->keys[i]; u8 type, authenticated; + if (hci_is_blocked_key(hdev, + HCI_BLOCKED_KEY_TYPE_LTK, + key->val)) { + bt_dev_warn(hdev, "Skipping blocked LTK for %pMR", + &key->addr.bdaddr); + continue; + } + switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: authenticated = 0x00; @@ -6914,6 +6988,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_appearance, MGMT_SET_APPEARANCE_SIZE }, { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE }, { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE }, + { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, + HCI_MGMT_VAR_LEN }, }; void mgmt_index_added(struct hci_dev *hdev) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 6b42be4b5861..4ece170c518e 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2453,6 +2453,15 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; + /* Pairing is aborted if any blocked keys are distributed */ + if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK, + rp->ltk)) { + bt_dev_warn_ratelimited(conn->hcon->hdev, + "LTK blocked for %pMR", + &conn->hcon->dst); + return SMP_INVALID_PARAMS; + } + SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); skb_pull(skb, sizeof(*rp)); @@ -2509,6 +2518,15 @@ static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; + /* Pairing is aborted if any blocked keys are distributed */ + if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK, + info->irk)) { + bt_dev_warn_ratelimited(conn->hcon->hdev, + "Identity key blocked for %pMR", + &conn->hcon->dst); + return SMP_INVALID_PARAMS; + } + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); -- cgit v1.2.3 From f9a619db7c137b7c2dec0414d8deb8ec762ae8f9 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 15 Jan 2020 13:02:17 -0800 Subject: Bluetooth: monitor: Add support for ISO packets This enables passing ISO packets to the monitor socket. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_mon.h | 2 ++ net/bluetooth/hci_sock.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 240786b04a46..2d5fcda1bcd0 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -49,6 +49,8 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_CLOSE 15 #define HCI_MON_CTRL_COMMAND 16 #define HCI_MON_CTRL_EVENT 17 +#define HCI_MON_ISO_TX_PKT 18 +#define HCI_MON_ISO_RX_PKT 19 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 5d0ed28c0d3a..3ae508674ef7 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -324,6 +324,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT); break; + case HCI_ISODATA_PKT: + if (bt_cb(skb)->incoming) + opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT); + else + opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT); + break; case HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; -- cgit v1.2.3 From 1b1d29e5149990e44634b2e681de71effd463591 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 15 Jan 2020 13:02:18 -0800 Subject: Bluetooth: Make use of __check_timeout on hci_sched_le This reuse __check_timeout on hci_sched_le following the same logic used hci_sched_acl. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f0298db26dc3..1ca7508b6ca7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4287,15 +4287,10 @@ static void hci_sched_le(struct hci_dev *hdev) if (!hci_conn_num(hdev, LE_LINK)) return; - if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { - /* LE tx timeout must be longer than maximum - * link supervision timeout (40.9 seconds) */ - if (!hdev->le_cnt && hdev->le_pkts && - time_after(jiffies, hdev->le_last_tx + HZ * 45)) - hci_link_tx_to(hdev, LE_LINK); - } - cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt; + + __check_timeout(hdev, cnt); + tmp = cnt; while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, "e))) { u32 priority = (skb_peek(&chan->data_q))->priority; -- cgit v1.2.3 From 117717e57440d2b5e07da40c621aa4f0ba423b80 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 15 Jan 2020 21:35:32 +0100 Subject: Bluetooth: Increment management interface revision Increment the mgmt revision due to the recently added commands. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 339c762eb6fd..0dc610faab70 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 14 +#define MGMT_REVISION 15 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, -- cgit v1.2.3 From c4c86abb3f9fb3d5bc113fb996298a30ec63e07b Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 15 Jan 2020 14:43:38 +0200 Subject: net/rds: Detect need of On-Demand-Paging memory registration Add code to check if memory intended for RDMA is FS-DAX-memory. RDS will fail with error code EOPNOTSUPP if FS-DAX-memory is detected. Signed-off-by: Hans Westgaard Ry Acked-by: Santosh Shilimkar Signed-off-by: Leon Romanovsky --- net/rds/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 916f5ec373d8..eb23c38ce2b3 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -156,11 +156,13 @@ void rds_rdma_drop_keys(struct rds_sock *rs) static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { + unsigned int gup_flags = FOLL_LONGTERM; int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, - pages); + if (write) + gup_flags |= FOLL_WRITE; + ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { while (ret--) put_page(pages[ret]); -- cgit v1.2.3 From 79b9b685dde1d1bf43cf84163c76953dc3781c85 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 5 Jan 2020 22:26:38 +0100 Subject: netfilter: flowtable: fetch stats only if flow is still alive Do not fetch statistics if flow has expired since it might not in hardware anymore. After this update, remove the FLOW_OFFLOAD_HW_DYING check from nf_flow_offload_stats() since this flag is never set on. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Pablo Neira Ayuso Acked-by: wenxu --- net/netfilter/nf_flow_table_core.c | 5 ++--- net/netfilter/nf_flow_table_offload.c | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index e33a73cb1f42..9e6de2bbeccb 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -348,9 +348,6 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - if (flow->flags & FLOW_OFFLOAD_HW) - nf_flow_offload_stats(flow_table, flow); - if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) { if (flow->flags & FLOW_OFFLOAD_HW) { @@ -361,6 +358,8 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) } else { flow_offload_del(flow_table, flow); } + } else if (flow->flags & FLOW_OFFLOAD_HW) { + nf_flow_offload_stats(flow_table, flow); } } diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index d06969af1085..4d1e81e2880f 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -784,8 +784,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, __s32 delta; delta = nf_flow_timeout_delta(flow->timeout); - if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10) || - flow->flags & FLOW_OFFLOAD_HW_DYING) + if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) return; offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); -- cgit v1.2.3 From a7521a60a5f3e1f58a015fedb6e69aed40455feb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Jan 2020 12:42:55 +0100 Subject: netfilter: flowtable: restrict flow dissector match on meta ingress device Set on FLOW_DISSECTOR_KEY_META meta key using flow tuple ingress interface. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 4d1e81e2880f..b879e673953f 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -24,6 +24,7 @@ struct flow_offload_work { }; struct nf_flow_key { + struct flow_dissector_key_meta meta; struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; union { @@ -55,6 +56,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); @@ -62,6 +64,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; + switch (tuple->l3proto) { case AF_INET: key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -105,7 +110,8 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->tp.dst = tuple->dst_port; mask->tp.dst = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL) | + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | + BIT(FLOW_DISSECTOR_KEY_CONTROL) | BIT(FLOW_DISSECTOR_KEY_BASIC) | BIT(FLOW_DISSECTOR_KEY_PORTS); return 0; -- cgit v1.2.3 From 87265d842c59d93798b380afe3288a2bdc19bcd9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 5 Jan 2020 20:48:51 +0100 Subject: netfilter: flowtable: add nf_flow_offload_work_alloc() Add helper function to allocate and initialize flow offload work and use it to consolidate existing code. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 38 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b879e673953f..d161623107a1 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -748,21 +748,35 @@ static void flow_offload_queue_work(struct flow_offload_work *offload) schedule_work(&nf_flow_offload_work); } -void nf_flow_offload_add(struct nf_flowtable *flowtable, - struct flow_offload *flow) +static struct flow_offload_work * +nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, + struct flow_offload *flow, unsigned int cmd) { struct flow_offload_work *offload; offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); if (!offload) - return; + return NULL; - offload->cmd = FLOW_CLS_REPLACE; + offload->cmd = cmd; offload->flow = flow; offload->priority = flowtable->priority; offload->flowtable = flowtable; - flow->flags |= FLOW_OFFLOAD_HW; + return offload; +} + + +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE); + if (!offload) + return; + + flow->flags |= FLOW_OFFLOAD_HW; flow_offload_queue_work(offload); } @@ -771,15 +785,11 @@ void nf_flow_offload_del(struct nf_flowtable *flowtable, { struct flow_offload_work *offload; - offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY); if (!offload) return; - offload->cmd = FLOW_CLS_DESTROY; - offload->flow = flow; - offload->flow->flags |= FLOW_OFFLOAD_HW_DYING; - offload->flowtable = flowtable; - + flow->flags |= FLOW_OFFLOAD_HW_DYING; flow_offload_queue_work(offload); } @@ -793,14 +803,10 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) return; - offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); if (!offload) return; - offload->cmd = FLOW_CLS_STATS; - offload->flow = flow; - offload->flowtable = flowtable; - flow_offload_queue_work(offload); } -- cgit v1.2.3 From 445db8d09659eb27bcd5920cb91d91686f0197d0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 5 Jan 2020 22:00:57 +0100 Subject: netfilter: flowtable: remove dying bit, use teardown bit instead The dying bit removes the conntrack entry if the netdev that owns this flow is going down. Instead, use the teardown mechanism to push back the flow to conntrack to let the classic software path decide what to do with it. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 ----- net/netfilter/nf_flow_table_core.c | 8 +++----- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 415b8f49d150..4ad924d5f983 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -85,7 +85,6 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_SNAT 0x1 #define FLOW_OFFLOAD_DNAT 0x2 -#define FLOW_OFFLOAD_DYING 0x4 #define FLOW_OFFLOAD_TEARDOWN 0x8 #define FLOW_OFFLOAD_HW 0x10 #define FLOW_OFFLOAD_HW_DYING 0x20 @@ -134,10 +133,6 @@ int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); void flow_offload_teardown(struct flow_offload *flow); -static inline void flow_offload_dead(struct flow_offload *flow) -{ - flow->flags |= FLOW_OFFLOAD_DYING; -} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9e6de2bbeccb..a9ed93a9e007 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -182,8 +182,6 @@ void flow_offload_free(struct flow_offload *flow) default: break; } - if (flow->flags & FLOW_OFFLOAD_DYING) - nf_ct_delete(flow->ct, 0, 0); nf_ct_put(flow->ct); kfree_rcu(flow, rcu_head); } @@ -300,7 +298,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) + if (flow->flags & FLOW_OFFLOAD_TEARDOWN) return NULL; if (unlikely(nf_ct_is_dying(flow->ct))) @@ -349,7 +347,7 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) struct nf_flowtable *flow_table = data; if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) { + (flow->flags & FLOW_OFFLOAD_TEARDOWN)) { if (flow->flags & FLOW_OFFLOAD_HW) { if (!(flow->flags & FLOW_OFFLOAD_HW_DYING)) nf_flow_offload_del(flow_table, flow); @@ -523,7 +521,7 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) - flow_offload_dead(flow); + flow_offload_teardown(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, -- cgit v1.2.3 From 355a8b13f87a8964ebe785b065f1388a1bd00c7e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 5 Jan 2020 20:41:15 +0100 Subject: netfilter: flowtable: use atomic bitwise operations for flow flags Originally, all flow flag bits were set on only from the workqueue. With the introduction of the flow teardown state and hardware offload this is no longer true. Let's be safe and use atomic bitwise operation to operation with flow flags. Fixes: 59c466dd68e7 ("netfilter: nf_flow_table: add a new flow state for tearing down offloading") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 16 +++++++++------- net/netfilter/nf_flow_table_core.c | 20 ++++++++++---------- net/netfilter/nf_flow_table_ip.c | 8 ++++---- net/netfilter/nf_flow_table_offload.c | 20 ++++++++++---------- 4 files changed, 33 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 4ad924d5f983..5a10e28c3e40 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -83,12 +83,14 @@ struct flow_offload_tuple_rhash { struct flow_offload_tuple tuple; }; -#define FLOW_OFFLOAD_SNAT 0x1 -#define FLOW_OFFLOAD_DNAT 0x2 -#define FLOW_OFFLOAD_TEARDOWN 0x8 -#define FLOW_OFFLOAD_HW 0x10 -#define FLOW_OFFLOAD_HW_DYING 0x20 -#define FLOW_OFFLOAD_HW_DEAD 0x40 +enum nf_flow_flags { + NF_FLOW_SNAT, + NF_FLOW_DNAT, + NF_FLOW_TEARDOWN, + NF_FLOW_HW, + NF_FLOW_HW_DYING, + NF_FLOW_HW_DEAD, +}; enum flow_offload_type { NF_FLOW_OFFLOAD_UNSPEC = 0, @@ -98,7 +100,7 @@ enum flow_offload_type { struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; - u16 flags; + unsigned long flags; u16 type; u32 timeout; struct rcu_head rcu_head; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index a9ed93a9e007..9f134f44d139 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -61,9 +61,9 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) - flow->flags |= FLOW_OFFLOAD_SNAT; + __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) - flow->flags |= FLOW_OFFLOAD_DNAT; + __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; @@ -269,7 +269,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table, if (nf_flow_has_expired(flow)) flow_offload_fixup_ct(flow->ct); - else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) + else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) flow_offload_fixup_ct_timeout(flow->ct); flow_offload_free(flow); @@ -277,7 +277,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table, void flow_offload_teardown(struct flow_offload *flow) { - flow->flags |= FLOW_OFFLOAD_TEARDOWN; + set_bit(NF_FLOW_TEARDOWN, &flow->flags); flow_offload_fixup_ct_state(flow->ct); } @@ -298,7 +298,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - if (flow->flags & FLOW_OFFLOAD_TEARDOWN) + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; if (unlikely(nf_ct_is_dying(flow->ct))) @@ -347,16 +347,16 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) struct nf_flowtable *flow_table = data; if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - (flow->flags & FLOW_OFFLOAD_TEARDOWN)) { - if (flow->flags & FLOW_OFFLOAD_HW) { - if (!(flow->flags & FLOW_OFFLOAD_HW_DYING)) + test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (test_bit(NF_FLOW_HW, &flow->flags)) { + if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); - else if (flow->flags & FLOW_OFFLOAD_HW_DEAD) + else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) flow_offload_del(flow_table, flow); } else { flow_offload_del(flow_table, flow); } - } else if (flow->flags & FLOW_OFFLOAD_HW) { + } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 7ea2ddc2aa93..f4ccb5f5008b 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -144,11 +144,11 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) return -1; @@ -414,11 +414,11 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow, struct ipv6hdr *ip6h = ipv6_hdr(skb); unsigned int thoff = sizeof(*ip6h); - if (flow->flags & FLOW_OFFLOAD_SNAT && + if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && + if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) return -1; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index d161623107a1..8a1fe391666e 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -450,16 +450,16 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; - if (flow->flags & FLOW_OFFLOAD_SNAT) { + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv4_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); } - if (flow->flags & FLOW_OFFLOAD_DNAT) { + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { flow_offload_ipv4_dnat(net, flow, dir, flow_rule); flow_offload_port_dnat(net, flow, dir, flow_rule); } - if (flow->flags & FLOW_OFFLOAD_SNAT || - flow->flags & FLOW_OFFLOAD_DNAT) + if (test_bit(NF_FLOW_SNAT, &flow->flags) || + test_bit(NF_FLOW_DNAT, &flow->flags)) flow_offload_ipv4_checksum(net, flow, flow_rule); flow_offload_redirect(flow, dir, flow_rule); @@ -476,11 +476,11 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; - if (flow->flags & FLOW_OFFLOAD_SNAT) { + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv6_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); } - if (flow->flags & FLOW_OFFLOAD_DNAT) { + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { flow_offload_ipv6_dnat(net, flow, dir, flow_rule); flow_offload_port_dnat(net, flow, dir, flow_rule); } @@ -636,7 +636,7 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload, list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); - offload->flow->flags |= FLOW_OFFLOAD_HW_DEAD; + set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } static int flow_offload_rule_add(struct flow_offload_work *offload, @@ -723,7 +723,7 @@ static void flow_offload_work_handler(struct work_struct *work) case FLOW_CLS_REPLACE: ret = flow_offload_work_add(offload); if (ret < 0) - offload->flow->flags &= ~FLOW_OFFLOAD_HW; + __clear_bit(NF_FLOW_HW, &offload->flow->flags); break; case FLOW_CLS_DESTROY: flow_offload_work_del(offload); @@ -776,7 +776,7 @@ void nf_flow_offload_add(struct nf_flowtable *flowtable, if (!offload) return; - flow->flags |= FLOW_OFFLOAD_HW; + __set_bit(NF_FLOW_HW, &flow->flags); flow_offload_queue_work(offload); } @@ -789,7 +789,7 @@ void nf_flow_offload_del(struct nf_flowtable *flowtable, if (!offload) return; - flow->flags |= FLOW_OFFLOAD_HW_DYING; + set_bit(NF_FLOW_HW_DYING, &flow->flags); flow_offload_queue_work(offload); } -- cgit v1.2.3 From a5449cdcaac5c78d62b8bea8f79158071f23da01 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 7 Jan 2020 09:56:27 +0100 Subject: netfilter: flowtable: add nf_flowtable_hw_offload() helper function This function checks for the NF_FLOWTABLE_HW_OFFLOAD flag, meaning that the flowtable hardware offload is enabled. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 +++++ net/netfilter/nf_flow_table_core.c | 2 +- net/netfilter/nf_flow_table_offload.c | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 5a10e28c3e40..9ee1eaeaab04 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -47,6 +47,11 @@ struct nf_flowtable { possible_net_t net; }; +static inline bool nf_flowtable_hw_offload(struct nf_flowtable *flowtable) +{ + return flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD; +} + enum flow_offload_tuple_dir { FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9f134f44d139..e919bafd68d1 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -243,7 +243,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD) + if (nf_flowtable_hw_offload(flow_table)) nf_flow_offload_add(flow_table, flow); return 0; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 8a1fe391666e..b4c79fbb2d82 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -812,7 +812,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { - if (flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD) + if (nf_flowtable_hw_offload(flowtable)) flush_work(&nf_flow_offload_work); } @@ -849,7 +849,7 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct flow_block_offload bo = {}; int err; - if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) + if (!nf_flowtable_hw_offload(flowtable)) return 0; if (!dev->netdev_ops->ndo_setup_tc) -- cgit v1.2.3 From f698fe40829b21088d323c8b0a7c626571528fc6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Jan 2020 12:56:47 +0100 Subject: netfilter: flowtable: refresh flow if hardware offload fails If nf_flow_offload_add() fails to add the flow to hardware, then the NF_FLOW_HW_REFRESH flag bit is set and the flow remains in the flowtable software path. If flowtable hardware offload is enabled, this patch enqueues a new request to offload this flow to hardware. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + net/netfilter/nf_flow_table_core.c | 4 +++- net/netfilter/nf_flow_table_ip.c | 13 +++++++++++++ net/netfilter/nf_flow_table_offload.c | 14 +++++--------- 4 files changed, 22 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 9ee1eaeaab04..e0f709d9d547 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -95,6 +95,7 @@ enum nf_flow_flags { NF_FLOW_HW, NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD, + NF_FLOW_HW_REFRESH, }; enum flow_offload_type { diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index e919bafd68d1..7e91989a1b55 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -243,8 +243,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - if (nf_flowtable_hw_offload(flow_table)) + if (nf_flowtable_hw_offload(flow_table)) { + __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); + } return 0; } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index f4ccb5f5008b..9e563fd3da0f 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -232,6 +232,13 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + return nf_flowtable_hw_offload(flow_table) && + test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags); +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -272,6 +279,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; @@ -498,6 +508,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, sizeof(*ip6h))) return NF_ACCEPT; + if (unlikely(nf_flow_offload_refresh(flow_table, flow))) + nf_flow_offload_add(flow_table, flow); + if (nf_flow_offload_dst_check(&rt->dst)) { flow_offload_teardown(flow); return NF_ACCEPT; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b4c79fbb2d82..77b129f196c6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -654,20 +654,20 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, return 0; } -static int flow_offload_work_add(struct flow_offload_work *offload) +static void flow_offload_work_add(struct flow_offload_work *offload) { struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; int err; err = nf_flow_offload_alloc(offload, flow_rule); if (err < 0) - return -ENOMEM; + return; err = flow_offload_rule_add(offload, flow_rule); + if (err < 0) + set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); nf_flow_offload_destroy(flow_rule); - - return err; } static void flow_offload_work_del(struct flow_offload_work *offload) @@ -712,7 +712,6 @@ static void flow_offload_work_handler(struct work_struct *work) { struct flow_offload_work *offload, *next; LIST_HEAD(offload_pending_list); - int ret; spin_lock_bh(&flow_offload_pending_list_lock); list_replace_init(&flow_offload_pending_list, &offload_pending_list); @@ -721,9 +720,7 @@ static void flow_offload_work_handler(struct work_struct *work) list_for_each_entry_safe(offload, next, &offload_pending_list, list) { switch (offload->cmd) { case FLOW_CLS_REPLACE: - ret = flow_offload_work_add(offload); - if (ret < 0) - __clear_bit(NF_FLOW_HW, &offload->flow->flags); + flow_offload_work_add(offload); break; case FLOW_CLS_DESTROY: flow_offload_work_del(offload); @@ -776,7 +773,6 @@ void nf_flow_offload_add(struct nf_flowtable *flowtable, if (!offload) return; - __set_bit(NF_FLOW_HW, &flow->flags); flow_offload_queue_work(offload); } -- cgit v1.2.3 From 28b3a4270c0fc064557e409111f2a678e64b6fa7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Jan 2020 12:25:10 +0100 Subject: netfilter: hashlimit: do not use indirect calls during gc no need, just use a simple boolean to indicate we want to reap all entries. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index ced3fc8fad7c..bccd47cd7190 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -357,21 +357,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, return 0; } -static bool select_all(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return true; -} - -static bool select_gc(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he) -{ - return time_after_eq(jiffies, he->expires); -} - -static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, - bool (*select)(const struct xt_hashlimit_htable *ht, - const struct dsthash_ent *he)) +static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all) { unsigned int i; @@ -381,7 +367,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, spin_lock_bh(&ht->lock); hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { - if ((*select)(ht, dh)) + if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } spin_unlock_bh(&ht->lock); @@ -395,7 +381,7 @@ static void htable_gc(struct work_struct *work) ht = container_of(work, struct xt_hashlimit_htable, gc_work.work); - htable_selective_cleanup(ht, select_gc); + htable_selective_cleanup(ht, false); queue_delayed_work(system_power_efficient_wq, &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval)); @@ -419,7 +405,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) { cancel_delayed_work_sync(&hinfo->gc_work); htable_remove_proc_entry(hinfo); - htable_selective_cleanup(hinfo, select_all); + htable_selective_cleanup(hinfo, true); kfree(hinfo->name); vfree(hinfo); } -- cgit v1.2.3 From ae29045018c83495f8c5033afabbedc09f24d7c2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Jan 2020 19:02:14 +0100 Subject: netfilter: flowtable: add nf_flow_offload_tuple() helper Consolidate code to configure the flow_cls_offload structure into one helper function. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 47 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 77b129f196c6..3cd8dc8714e3 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -592,23 +592,25 @@ static void nf_flow_offload_init(struct flow_cls_offload *cls_flow, cls_flow->cookie = (unsigned long)tuple; } -static int flow_offload_tuple_add(struct flow_offload_work *offload, - struct nf_flow_rule *flow_rule, - enum flow_offload_tuple_dir dir) +static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, + struct flow_offload *flow, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir, + int priority, int cmd, + struct list_head *block_cb_list) { - struct nf_flowtable *flowtable = offload->flowtable; struct flow_cls_offload cls_flow = {}; struct flow_block_cb *block_cb; struct netlink_ext_ack extack; __be16 proto = ETH_P_ALL; int err, i = 0; - nf_flow_offload_init(&cls_flow, proto, offload->priority, - FLOW_CLS_REPLACE, - &offload->flow->tuplehash[dir].tuple, &extack); - cls_flow.rule = flow_rule->rule; + nf_flow_offload_init(&cls_flow, proto, priority, cmd, + &flow->tuplehash[dir].tuple, &extack); + if (cmd == FLOW_CLS_REPLACE) + cls_flow.rule = flow_rule->rule; - list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) { + list_for_each_entry(block_cb, block_cb_list, list) { err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); if (err < 0) @@ -620,23 +622,22 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload, return i; } +static int flow_offload_tuple_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir) +{ + return nf_flow_offload_tuple(offload->flowtable, offload->flow, + flow_rule, dir, offload->priority, + FLOW_CLS_REPLACE, + &offload->flowtable->flow_block.cb_list); +} + static void flow_offload_tuple_del(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { - struct nf_flowtable *flowtable = offload->flowtable; - struct flow_cls_offload cls_flow = {}; - struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; - __be16 proto = ETH_P_ALL; - - nf_flow_offload_init(&cls_flow, proto, offload->priority, - FLOW_CLS_DESTROY, - &offload->flow->tuplehash[dir].tuple, &extack); - - list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) - block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); - - set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_DESTROY, + &offload->flowtable->flow_block.cb_list); } static int flow_offload_rule_add(struct flow_offload_work *offload, -- cgit v1.2.3 From a7965d58ddab0253ddfae58bd5c7d2de46ef0f76 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Jan 2020 19:02:22 +0100 Subject: netfilter: flowtable: add nf_flow_table_offload_cmd() Split nf_flow_table_offload_setup() in two functions to make it more maintainable. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 40 ++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 3cd8dc8714e3..c8b70ffeef0c 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -838,12 +838,12 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, return err; } -int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, - struct net_device *dev, - enum flow_block_command cmd) +static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) { - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo = {}; int err; if (!nf_flowtable_hw_offload(flowtable)) @@ -852,14 +852,30 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, if (!dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; - bo.net = dev_net(dev); - bo.block = &flowtable->flow_block; - bo.command = cmd; - bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; - bo.extack = &extack; - INIT_LIST_HEAD(&bo.cb_list); + memset(bo, 0, sizeof(*bo)); + bo->net = dev_net(dev); + bo->block = &flowtable->flow_block; + bo->command = cmd; + bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); + if (err < 0) + return err; + + return 0; +} + +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; + int err; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, &bo); + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); if (err < 0) return err; -- cgit v1.2.3 From fbf19ddf396b3526718a64e24ace316c64d3ed1f Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:48 +0000 Subject: netfilter: nf_tables: white-space fixes. Indentation fixes for the parameters of a few nft functions. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 4 ++-- net/netfilter/nft_set_bitmap.c | 4 ++-- net/netfilter/nft_set_hash.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 10e9d50e4e19..e8ca1ec105f8 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -130,8 +130,8 @@ nla_put_failure: static struct nft_data zero; static int nft_bitwise_offload(struct nft_offload_ctx *ctx, - struct nft_flow_rule *flow, - const struct nft_expr *expr) + struct nft_flow_rule *flow, + const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 087a056e34d1..87e8d9ba0c9b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -259,8 +259,8 @@ static u64 nft_bitmap_privsize(const struct nlattr * const nla[], } static int nft_bitmap_init(const struct nft_set *set, - const struct nft_set_desc *desc, - const struct nlattr * const nla[]) + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) { struct nft_bitmap *priv = nft_set_priv(set); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index b331a3c9a3a8..d350a7cd3af0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -645,7 +645,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, } static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, - struct nft_set_estimate *est) + struct nft_set_estimate *est) { if (!desc->size) return false; -- cgit v1.2.3 From 265ec7b0e8c35ea70e694da43aa8896e2614def9 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:49 +0000 Subject: netfilter: bitwise: remove NULL comparisons from attribute checks. In later patches, we will be adding more checks. In order to be consistent and prevent complaints from checkpatch.pl, replace the existing comparisons with NULL with logical NOT operators. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index e8ca1ec105f8..85605fb1e360 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -52,11 +52,11 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, u32 len; int err; - if (tb[NFTA_BITWISE_SREG] == NULL || - tb[NFTA_BITWISE_DREG] == NULL || - tb[NFTA_BITWISE_LEN] == NULL || - tb[NFTA_BITWISE_MASK] == NULL || - tb[NFTA_BITWISE_XOR] == NULL) + if (!tb[NFTA_BITWISE_SREG] || + !tb[NFTA_BITWISE_DREG] || + !tb[NFTA_BITWISE_LEN] || + !tb[NFTA_BITWISE_MASK] || + !tb[NFTA_BITWISE_XOR]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); -- cgit v1.2.3 From 577c734a81e2f59aa4490071ebe074fc05d5540b Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:50 +0000 Subject: netfilter: bitwise: replace gotos with returns. When dumping a bitwise expression, if any of the puts fails, we use goto to jump to a label. However, no clean-up is required and the only statement at the label is a return. Drop the goto's and return immediately instead. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 85605fb1e360..c15e9beb5243 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -107,24 +107,21 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) const struct nft_bitwise *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) - goto nla_put_failure; + return -1; if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg)) - goto nla_put_failure; + return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) - goto nla_put_failure; + return -1; if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + return -1; if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, NFT_DATA_VALUE, priv->len) < 0) - goto nla_put_failure; + return -1; return 0; - -nla_put_failure: - return -1; } static struct nft_data zero; -- cgit v1.2.3 From 9d1f979986c2e29632b6a8f7a8ef8b3c7d24a48c Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:51 +0000 Subject: netfilter: bitwise: add NFTA_BITWISE_OP netlink attribute. Add a new bitwise netlink attribute, NFTA_BITWISE_OP, which is set to a value of a new enum, nft_bitwise_ops. It describes the type of operation an expression contains. Currently, it only has one value: NFT_BITWISE_BOOL. More values will be added later to implement shifts. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 12 ++++++++++++ net/netfilter/nft_bitwise.c | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index dd4611767933..0cddf357281f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -484,6 +484,16 @@ enum nft_immediate_attributes { }; #define NFTA_IMMEDIATE_MAX (__NFTA_IMMEDIATE_MAX - 1) +/** + * enum nft_bitwise_ops - nf_tables bitwise operations + * + * @NFT_BITWISE_BOOL: mask-and-xor operation used to implement NOT, AND, OR and + * XOR boolean operations + */ +enum nft_bitwise_ops { + NFT_BITWISE_BOOL, +}; + /** * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes * @@ -492,6 +502,7 @@ enum nft_immediate_attributes { * @NFTA_BITWISE_LEN: length of operands (NLA_U32) * @NFTA_BITWISE_MASK: mask value (NLA_NESTED: nft_data_attributes) * @NFTA_BITWISE_XOR: xor value (NLA_NESTED: nft_data_attributes) + * @NFTA_BITWISE_OP: type of operation (NLA_U32: nft_bitwise_ops) * * The bitwise expression performs the following operation: * @@ -512,6 +523,7 @@ enum nft_bitwise_attributes { NFTA_BITWISE_LEN, NFTA_BITWISE_MASK, NFTA_BITWISE_XOR, + NFTA_BITWISE_OP, __NFTA_BITWISE_MAX }; #define NFTA_BITWISE_MAX (__NFTA_BITWISE_MAX - 1) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index c15e9beb5243..6948df7b0587 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -18,6 +18,7 @@ struct nft_bitwise { enum nft_registers sreg:8; enum nft_registers dreg:8; + enum nft_bitwise_ops op:8; u8 len; struct nft_data mask; struct nft_data xor; @@ -41,6 +42,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, + [NFTA_BITWISE_OP] = { .type = NLA_U32 }, }; static int nft_bitwise_init(const struct nft_ctx *ctx, @@ -76,6 +78,18 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (tb[NFTA_BITWISE_OP]) { + priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); + switch (priv->op) { + case NFT_BITWISE_BOOL: + break; + default: + return -EOPNOTSUPP; + } + } else { + priv->op = NFT_BITWISE_BOOL; + } + err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, tb[NFTA_BITWISE_MASK]); if (err < 0) @@ -112,6 +126,8 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) return -1; + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op))) + return -1; if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) -- cgit v1.2.3 From 3f8d9eb032ec76c35344a2453c4c4a0a29805e3f Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:52 +0000 Subject: netfilter: bitwise: add helper for initializing boolean operations. Split the code specific to initializing bitwise boolean operations out into a separate function. A similar function will be added later for shift operations. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 66 ++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 6948df7b0587..d0cc5f753e52 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -45,20 +45,53 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_OP] = { .type = NLA_U32 }, }; +static int nft_bitwise_init_bool(struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + struct nft_data_desc d1, d2; + int err; + + if (!tb[NFTA_BITWISE_MASK] || + !tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, + tb[NFTA_BITWISE_MASK]); + if (err < 0) + return err; + if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) { + err = -EINVAL; + goto err1; + } + + err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, + tb[NFTA_BITWISE_XOR]); + if (err < 0) + goto err1; + if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) { + err = -EINVAL; + goto err2; + } + + return 0; +err2: + nft_data_release(&priv->xor, d2.type); +err1: + nft_data_release(&priv->mask, d1.type); + return err; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_bitwise *priv = nft_expr_priv(expr); - struct nft_data_desc d1, d2; u32 len; int err; if (!tb[NFTA_BITWISE_SREG] || !tb[NFTA_BITWISE_DREG] || - !tb[NFTA_BITWISE_LEN] || - !tb[NFTA_BITWISE_MASK] || - !tb[NFTA_BITWISE_XOR]) + !tb[NFTA_BITWISE_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); @@ -90,29 +123,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->op = NFT_BITWISE_BOOL; } - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, - tb[NFTA_BITWISE_MASK]); - if (err < 0) - return err; - if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) { - err = -EINVAL; - goto err1; - } - - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, - tb[NFTA_BITWISE_XOR]); - if (err < 0) - goto err1; - if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) { - err = -EINVAL; - goto err2; + switch(priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_init_bool(priv, tb); + break; } - return 0; -err2: - nft_data_release(&priv->xor, d2.type); -err1: - nft_data_release(&priv->mask, d1.type); return err; } -- cgit v1.2.3 From 71d6ded3ac496f9a6563752917921b7db0f38a34 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:53 +0000 Subject: netfilter: bitwise: add helper for evaluating boolean operations. Split the code specific to evaluating bitwise boolean operations out into a separate function. Similar functions will be added later for shift operations. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index d0cc5f753e52..88a91c63f213 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -24,16 +24,27 @@ struct nft_bitwise { struct nft_data xor; }; +static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + unsigned int i; + + for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) + dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); const u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - unsigned int i; - for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) - dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; + switch (priv->op) { + case NFT_BITWISE_BOOL: + nft_bitwise_eval_bool(dst, src, priv); + break; + } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { -- cgit v1.2.3 From 4d57ca2be146bdb272b57892ec59cd273f9ef3fc Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:54 +0000 Subject: netfilter: bitwise: add helper for dumping boolean operations. Split the code specific to dumping bitwise boolean operations out into a separate function. A similar function will be added later for shift operations. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 88a91c63f213..41265134cf0b 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -143,9 +143,24 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, return err; } +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, + NFT_DATA_VALUE, priv->len) < 0) + return -1; + + if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, + NFT_DATA_VALUE, priv->len) < 0) + return -1; + + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); + int err = 0; if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) return -1; @@ -156,15 +171,13 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op))) return -1; - if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, - NFT_DATA_VALUE, priv->len) < 0) - return -1; - - if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor, - NFT_DATA_VALUE, priv->len) < 0) - return -1; + switch (priv->op) { + case NFT_BITWISE_BOOL: + err = nft_bitwise_dump_bool(skb, priv); + break; + } - return 0; + return err; } static struct nft_data zero; -- cgit v1.2.3 From ed991d43634b11e759a800850ba9656b5b467a56 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:55 +0000 Subject: netfilter: bitwise: only offload boolean operations. Only boolean operations supports offloading, so check the type of the operation and return an error for other types. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 41265134cf0b..b4619d9989ea 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -189,6 +189,9 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, const struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + if (priv->op != NFT_BITWISE_BOOL) + return -EOPNOTSUPP; + if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || priv->sreg != priv->dreg || priv->len != reg->len) return -EOPNOTSUPP; -- cgit v1.2.3 From 779f725e142cd987a69296c0eb7d416ee3ba57dd Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:56 +0000 Subject: netfilter: bitwise: add NFTA_BITWISE_DATA attribute. Add a new bitwise netlink attribute that will be used by shift operations to store the size of the shift. It is not used by boolean operations. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +++ net/netfilter/nft_bitwise.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0cddf357281f..8bef0620bc4f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -503,6 +503,8 @@ enum nft_bitwise_ops { * @NFTA_BITWISE_MASK: mask value (NLA_NESTED: nft_data_attributes) * @NFTA_BITWISE_XOR: xor value (NLA_NESTED: nft_data_attributes) * @NFTA_BITWISE_OP: type of operation (NLA_U32: nft_bitwise_ops) + * @NFTA_BITWISE_DATA: argument for non-boolean operations + * (NLA_NESTED: nft_data_attributes) * * The bitwise expression performs the following operation: * @@ -524,6 +526,7 @@ enum nft_bitwise_attributes { NFTA_BITWISE_MASK, NFTA_BITWISE_XOR, NFTA_BITWISE_OP, + NFTA_BITWISE_DATA, __NFTA_BITWISE_MAX }; #define NFTA_BITWISE_MAX (__NFTA_BITWISE_MAX - 1) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index b4619d9989ea..744008a527fb 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -22,6 +22,7 @@ struct nft_bitwise { u8 len; struct nft_data mask; struct nft_data xor; + struct nft_data data; }; static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, @@ -54,6 +55,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; static int nft_bitwise_init_bool(struct nft_bitwise *priv, @@ -62,6 +64,9 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, struct nft_data_desc d1, d2; int err; + if (tb[NFTA_BITWISE_DATA]) + return -EINVAL; + if (!tb[NFTA_BITWISE_MASK] || !tb[NFTA_BITWISE_XOR]) return -EINVAL; -- cgit v1.2.3 From 567d746b55bc66d3800c9ae91d50f0c5deb2fd93 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Wed, 15 Jan 2020 20:05:57 +0000 Subject: netfilter: bitwise: add support for shifts. Hitherto nft_bitwise has only supported boolean operations: NOT, AND, OR and XOR. Extend it to do shifts as well. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 9 +++- net/netfilter/nft_bitwise.c | 77 ++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8bef0620bc4f..261864736b26 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -489,9 +489,13 @@ enum nft_immediate_attributes { * * @NFT_BITWISE_BOOL: mask-and-xor operation used to implement NOT, AND, OR and * XOR boolean operations + * @NFT_BITWISE_LSHIFT: left-shift operation + * @NFT_BITWISE_RSHIFT: right-shift operation */ enum nft_bitwise_ops { NFT_BITWISE_BOOL, + NFT_BITWISE_LSHIFT, + NFT_BITWISE_RSHIFT, }; /** @@ -506,11 +510,12 @@ enum nft_bitwise_ops { * @NFTA_BITWISE_DATA: argument for non-boolean operations * (NLA_NESTED: nft_data_attributes) * - * The bitwise expression performs the following operation: + * The bitwise expression supports boolean and shift operations. It implements + * the boolean operations by performing the following operation: * * dreg = (sreg & mask) ^ xor * - * which allow to express all bitwise operations: + * with these mask and xor values: * * mask xor * NOT: 1 1 diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 744008a527fb..0ed2281f03be 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -34,6 +34,32 @@ static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; } +static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) { + dst[i - 1] = (src[i - 1] << shift) | carry; + carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift); + } +} + +static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) +{ + u32 shift = priv->data.data[0]; + unsigned int i; + u32 carry = 0; + + for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) { + dst[i] = carry | (src[i] >> shift); + carry = src[i] << (BITS_PER_TYPE(u32) - shift); + } +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -45,6 +71,12 @@ void nft_bitwise_eval(const struct nft_expr *expr, case NFT_BITWISE_BOOL: nft_bitwise_eval_bool(dst, src, priv); break; + case NFT_BITWISE_LSHIFT: + nft_bitwise_eval_lshift(dst, src, priv); + break; + case NFT_BITWISE_RSHIFT: + nft_bitwise_eval_rshift(dst, src, priv); + break; } } @@ -97,6 +129,32 @@ err1: return err; } +static int nft_bitwise_init_shift(struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + struct nft_data_desc d; + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if (!tb[NFTA_BITWISE_DATA]) + return -EINVAL; + + err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) || + priv->data.data[0] >= BITS_PER_TYPE(u32)) { + nft_data_release(&priv->data, d.type); + return -EINVAL; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -131,6 +189,8 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { case NFT_BITWISE_BOOL: + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: break; default: return -EOPNOTSUPP; @@ -143,6 +203,10 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, case NFT_BITWISE_BOOL: err = nft_bitwise_init_bool(priv, tb); break; + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_init_shift(priv, tb); + break; } return err; @@ -162,6 +226,15 @@ static int nft_bitwise_dump_bool(struct sk_buff *skb, return 0; } +static int nft_bitwise_dump_shift(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); @@ -180,6 +253,10 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) case NFT_BITWISE_BOOL: err = nft_bitwise_dump_bool(skb, priv); break; + case NFT_BITWISE_LSHIFT: + case NFT_BITWISE_RSHIFT: + err = nft_bitwise_dump_shift(skb, priv); + break; } return err; -- cgit v1.2.3 From 75ccae62cb8d42a619323a85c577107b8b37d797 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 16 Jan 2020 16:14:44 +0100 Subject: xdp: Move devmap bulk queue into struct net_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 96360004b862 ("xdp: Make devmap flush_list common for all map instances"), changed devmap flushing to be a global operation instead of a per-map operation. However, the queue structure used for bulking was still allocated as part of the containing map. This patch moves the devmap bulk queue into struct net_device. The motivation for this is reusing it for the non-map variant of XDP_REDIRECT, which will be changed in a subsequent commit. To avoid other fields of struct net_device moving to different cache lines, we also move a couple of other members around. We defer the actual allocation of the bulk queue structure until the NETDEV_REGISTER notification devmap.c. This makes it possible to check for ndo_xdp_xmit support before allocating the structure, which is not possible at the time struct net_device is allocated. However, we keep the freeing in free_netdev() to avoid adding another RCU callback on NETDEV_UNREGISTER. Because of this change, we lose the reference back to the map that originated the redirect, so change the tracepoint to always return 0 as the map ID and index. Otherwise no functional change is intended with this patch. After this patch, the relevant part of struct net_device looks like this, according to pahole: /* --- cacheline 14 boundary (896 bytes) --- */ struct netdev_queue * _tx __attribute__((__aligned__(64))); /* 896 8 */ unsigned int num_tx_queues; /* 904 4 */ unsigned int real_num_tx_queues; /* 908 4 */ struct Qdisc * qdisc; /* 912 8 */ unsigned int tx_queue_len; /* 920 4 */ spinlock_t tx_global_lock; /* 924 4 */ struct xdp_dev_bulk_queue * xdp_bulkq; /* 928 8 */ struct xps_dev_maps * xps_cpus_map; /* 936 8 */ struct xps_dev_maps * xps_rxqs_map; /* 944 8 */ struct mini_Qdisc * miniq_egress; /* 952 8 */ /* --- cacheline 15 boundary (960 bytes) --- */ struct hlist_head qdisc_hash[16]; /* 960 128 */ /* --- cacheline 17 boundary (1088 bytes) --- */ struct timer_list watchdog_timer; /* 1088 40 */ /* XXX last struct has 4 bytes of padding */ int watchdog_timeo; /* 1128 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head todo_list; /* 1136 16 */ /* --- cacheline 18 boundary (1152 bytes) --- */ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768397.1458396.12673224324627072349.stgit@toke.dk --- include/linux/netdevice.h | 13 ++++++---- include/trace/events/xdp.h | 2 +- kernel/bpf/devmap.c | 63 ++++++++++++++++++++-------------------------- net/core/dev.c | 2 ++ 4 files changed, 38 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2741aa35bec6..5ec3537fbdb1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -876,6 +876,7 @@ enum bpf_netdev_command { struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; +struct xdp_dev_bulk_queue; struct netdev_bpf { enum bpf_netdev_command command; @@ -1986,12 +1987,10 @@ struct net_device { unsigned int num_tx_queues; unsigned int real_num_tx_queues; struct Qdisc *qdisc; -#ifdef CONFIG_NET_SCHED - DECLARE_HASHTABLE (qdisc_hash, 4); -#endif unsigned int tx_queue_len; spinlock_t tx_global_lock; - int watchdog_timeo; + + struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_cpus_map; @@ -2001,11 +2000,15 @@ struct net_device { struct mini_Qdisc __rcu *miniq_egress; #endif +#ifdef CONFIG_NET_SCHED + DECLARE_HASHTABLE (qdisc_hash, 4); +#endif /* These may be needed for future network-power-down code. */ struct timer_list watchdog_timer; + int watchdog_timeo; - int __percpu *pcpu_refcnt; struct list_head todo_list; + int __percpu *pcpu_refcnt; struct list_head link_watch_list; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index a7378bcd9928..72bad13d4a3c 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -278,7 +278,7 @@ TRACE_EVENT(xdp_devmap_xmit, ), TP_fast_assign( - __entry->map_id = map->id; + __entry->map_id = map ? map->id : 0; __entry->act = XDP_REDIRECT; __entry->map_index = map_index; __entry->drops = drops; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index da9c832fc5c8..030d125c3839 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,13 +53,11 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 -struct bpf_dtab_netdev; - -struct xdp_bulk_queue { +struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; + struct net_device *dev; struct net_device *dev_rx; - struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -67,9 +65,8 @@ struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; - struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; - unsigned int idx; /* keep track of map index for tracepoint */ + unsigned int idx; }; struct bpf_dtab { @@ -219,7 +216,6 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -234,7 +230,6 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -320,10 +315,9 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) +static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { - struct bpf_dtab_netdev *obj = bq->obj; - struct net_device *dev = obj->dev; + struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; @@ -346,8 +340,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, - sent, drops, bq->dev_rx, dev, err); + trace_xdp_devmap_xmit(NULL, 0, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); return 0; @@ -374,7 +367,7 @@ error: void __dev_map_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq, *tmp; + struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) @@ -401,12 +394,12 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, +static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); - struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -444,7 +437,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf, dev_rx); + return bq_enqueue(dev, xdpf, dev_rx); } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -483,7 +476,6 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -538,30 +530,15 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, u32 ifindex, unsigned int idx) { - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev; - struct xdp_bulk_queue *bq; - int cpu; - dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { - free_percpu(dev->bulkq); kfree(dev); return ERR_PTR(-EINVAL); } @@ -721,9 +698,23 @@ static int dev_map_notification(struct notifier_block *notifier, { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; - int i; + int i, cpu; switch (event) { + case NETDEV_REGISTER: + if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) + break; + + /* will be freed in free_netdev() */ + netdev->xdp_bulkq = + __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), + sizeof(void *), GFP_ATOMIC); + if (!netdev->xdp_bulkq) + return NOTIFY_BAD; + + for_each_possible_cpu(cpu) + per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete diff --git a/net/core/dev.c b/net/core/dev.c index d99f88c58636..e7802a41ae7f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9847,6 +9847,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + free_percpu(dev->xdp_bulkq); + dev->xdp_bulkq = NULL; netdev_unregister_lockdep_key(dev); -- cgit v1.2.3 From 1d233886dd904edbf239eeffe435c3308ae97625 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 16 Jan 2020 16:14:45 +0100 Subject: xdp: Use bulking for non-map XDP_REDIRECT and consolidate code paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the bulk queue used by XDP_REDIRECT now lives in struct net_device, we can re-use the bulking for the non-map version of the bpf_redirect() helper. This is a simple matter of having xdp_do_redirect_slow() queue the frame on the bulk queue instead of sending it out with __bpf_tx_xdp(). Unfortunately we can't make the bpf_redirect() helper return an error if the ifindex doesn't exit (as bpf_redirect_map() does), because we don't have a reference to the network namespace of the ingress device at the time the helper is called. So we have to leave it as-is and keep the device lookup in xdp_do_redirect_slow(). Since this leaves less reason to have the non-map redirect code in a separate function, so we get rid of the xdp_do_redirect_slow() function entirely. This does lose us the tracepoint disambiguation, but fortunately the xdp_redirect and xdp_redirect_map tracepoints use the same tracepoint entry structures. This means both can contain a map index, so we can just amend the tracepoint definitions so we always emit the xdp_redirect(_err) tracepoints, but with the map ID only populated if a map is present. This means we retire the xdp_redirect_map(_err) tracepoints entirely, but keep the definitions around in case someone is still listening for them. With this change, the performance of the xdp_redirect sample program goes from 5Mpps to 8.4Mpps (a 68% increase). Since the flush functions are no longer map-specific, rename the flush() functions to drop _map from their names. One of the renamed functions is the xdp_do_flush_map() callback used in all the xdp-enabled drivers. To keep from having to update all drivers, use a #define to keep the old name working, and only update the virtual drivers in this patch. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/157918768505.1458396.17518057312953572912.stgit@toke.dk --- drivers/net/tun.c | 4 +- drivers/net/veth.c | 2 +- drivers/net/virtio_net.c | 2 +- include/linux/bpf.h | 13 +++++- include/linux/filter.h | 10 ++++- include/trace/events/xdp.h | 101 ++++++++++++++++++++------------------------- kernel/bpf/devmap.c | 32 +++++++++----- net/core/filter.c | 90 +++++++++------------------------------- 8 files changed, 108 insertions(+), 146 deletions(-) (limited to 'net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 683d371e6e82..3a5a6c655dda 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1718,7 +1718,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, if (err < 0) goto err_xdp; if (err == XDP_REDIRECT) - xdp_do_flush_map(); + xdp_do_flush(); if (err != XDP_PASS) goto out; @@ -2549,7 +2549,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) } if (flush) - xdp_do_flush_map(); + xdp_do_flush(); rcu_read_unlock(); local_bh_enable(); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a552df37a347..1c89017beebb 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -769,7 +769,7 @@ static int veth_poll(struct napi_struct *napi, int budget) if (xdp_xmit & VETH_XDP_TX) veth_xdp_flush(rq->dev, &bq); if (xdp_xmit & VETH_XDP_REDIR) - xdp_do_flush_map(); + xdp_do_flush(); xdp_clear_return_frame_no_direct(); return done; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4d7d5434cc5d..c458cd313281 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1432,7 +1432,7 @@ static int virtnet_poll(struct napi_struct *napi, int budget) virtqueue_napi_complete(napi, rq->vq, received); if (xdp_xmit & VIRTIO_XDP_REDIR) - xdp_do_flush_map(); + xdp_do_flush(); if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_sq(vi); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3517e32149a4..8e3b8f4ad183 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1056,7 +1056,9 @@ struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_flush(void); +void __dev_flush(void); +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -1169,13 +1171,20 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map return NULL; } -static inline void __dev_map_flush(void) +static inline void __dev_flush(void) { } struct xdp_buff; struct bpf_dtab_netdev; +static inline +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return 0; +} + static inline int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx) diff --git a/include/linux/filter.h b/include/linux/filter.h index a366a0b64a57..f349e2c0884c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -918,7 +918,7 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, return 0; } -/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the +/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. @@ -929,7 +929,13 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); -void xdp_do_flush_map(void); +void xdp_do_flush(void); + +/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as + * it is no longer only flushing maps. Keep this define for compatibility + * until all drivers are updated - do not use xdp_do_flush_map() in new code! + */ +#define xdp_do_flush_map xdp_do_flush void bpf_warn_invalid_xdp_action(u32 act); diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 72bad13d4a3c..b680973687b4 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -79,14 +79,26 @@ TRACE_EVENT(xdp_bulk_tx, __entry->sent, __entry->drops, __entry->err) ); +#ifndef __DEVMAP_OBJ_TYPE +#define __DEVMAP_OBJ_TYPE +struct _bpf_dtab_netdev { + struct net_device *dev; +}; +#endif /* __DEVMAP_OBJ_TYPE */ + +#define devmap_ifindex(tgt, map) \ + (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? \ + ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) + DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), + const void *tgt, int err, + const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), + TP_ARGS(dev, xdp, tgt, err, map, index), TP_STRUCT__entry( __field(int, prog_id) @@ -103,90 +115,65 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; - __entry->to_ifindex = to_ifindex; + __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : + index; __entry->map_id = map ? map->id : 0; - __entry->map_index = map_index; + __entry->map_index = map ? index : 0; ), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d", + TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" + " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, - __entry->err) + __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, to, 0, NULL, 0); + trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to); #define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, NULL, 0); + trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to); + +#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ + trace_xdp_redirect(dev, xdp, to, 0, map, index); + +#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, map, index); -DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map, +/* not used anymore, but kept around so as not to break old programs */ +DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" - " map_id=%d map_index=%d", - __entry->prog_id, - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->ifindex, __entry->to_ifindex, - __entry->err, - __entry->map_id, __entry->map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); -DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, +DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_ifindex, int err, - const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" - " map_id=%d map_index=%d", - __entry->prog_id, - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->ifindex, __entry->to_ifindex, - __entry->err, - __entry->map_id, __entry->map_index) + const void *tgt, int err, + const struct bpf_map *map, u32 index), + TP_ARGS(dev, xdp, tgt, err, map, index) ); -#ifndef __DEVMAP_OBJ_TYPE -#define __DEVMAP_OBJ_TYPE -struct _bpf_dtab_netdev { - struct net_device *dev; -}; -#endif /* __DEVMAP_OBJ_TYPE */ - -#define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) - -#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ - 0, map, idx) - -#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ - err, map, idx) - TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 030d125c3839..d5311009953f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -81,7 +81,7 @@ struct bpf_dtab { u32 n_buckets; }; -static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); +static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -357,16 +357,16 @@ error: goto out; } -/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled +/* __dev_flush is called from xdp_do_flush() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(void) +void __dev_flush(void) { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; rcu_read_lock(); @@ -396,9 +396,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) */ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) - { - struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -419,10 +418,9 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, - struct net_device *dev_rx) +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) { - struct net_device *dev = dst->dev; struct xdp_frame *xdpf; int err; @@ -440,6 +438,20 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return __xdp_enqueue(dev, xdp, dev_rx); +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + + return __xdp_enqueue(dev, xdp, dev_rx); +} + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { @@ -762,7 +774,7 @@ static int __init dev_map_init(void) register_netdevice_notifier(&dev_map_notifier); for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); + INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index d6f58dc4e1c4..17de6747d9e3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3458,58 +3458,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp(struct net_device *dev, - struct bpf_map *map, - struct xdp_buff *xdp, - u32 index) -{ - struct xdp_frame *xdpf; - int err, sent; - - if (!dev->netdev_ops->ndo_xdp_xmit) { - return -EOPNOTSUPP; - } - - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); - if (unlikely(err)) - return err; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); - if (sent <= 0) - return sent; - return 0; -} - -static noinline int -xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) -{ - struct net_device *fwd; - u32 index = ri->tgt_index; - int err; - - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->tgt_index = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; - - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; -} - static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp) { @@ -3527,13 +3475,13 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, return 0; } -void xdp_do_flush_map(void) +void xdp_do_flush(void) { - __dev_map_flush(); + __dev_flush(); __cpu_map_flush(); __xsk_map_flush(); } -EXPORT_SYMBOL_GPL(xdp_do_flush_map); +EXPORT_SYMBOL_GPL(xdp_do_flush); static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { @@ -3568,10 +3516,11 @@ void bpf_clear_redirect_map(struct bpf_map *map) } } -static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_map *map, - struct bpf_redirect_info *ri) +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->tgt_index; void *fwd = ri->tgt_value; int err; @@ -3580,7 +3529,18 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + if (unlikely(!map)) { + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = dev_xdp_enqueue(fwd, xdp, dev); + } else { + err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + } + if (unlikely(err)) goto err; @@ -3590,18 +3550,6 @@ err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } - -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) -{ - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - - if (likely(map)) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); - - return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); -} EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, -- cgit v1.2.3 From 56f200c78ce4d94680a27a1ce97a29ebeb4f23e1 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 16 Jan 2020 21:16:46 +0100 Subject: netns: Constify exported functions Mark function parameters as 'const' where possible. Signed-off-by: Guillaume Nault Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 +++++----- net/core/net_namespace.c | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b8ceaf0cd997..854d39ef1ca3 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -347,9 +347,9 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp); -int peernet2id(struct net *net, struct net *peer); -bool peernet_has_id(struct net *net, struct net *peer); -struct net *get_net_ns_by_id(struct net *net, int id); +int peernet2id(const struct net *net, struct net *peer); +bool peernet_has_id(const struct net *net, struct net *peer); +struct net *get_net_ns_by_id(const struct net *net, int id); struct pernet_operations { struct list_head list; @@ -427,7 +427,7 @@ static inline void unregister_net_sysctl_table(struct ctl_table_header *header) } #endif -static inline int rt_genid_ipv4(struct net *net) +static inline int rt_genid_ipv4(const struct net *net) { return atomic_read(&net->ipv4.rt_genid); } @@ -459,7 +459,7 @@ static inline void rt_genid_bump_all(struct net *net) rt_genid_bump_ipv6(net); } -static inline int fnhe_genid(struct net *net) +static inline int fnhe_genid(const struct net *net) { return atomic_read(&net->fnhe_genid); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6412c1fbfcb5..757cc1d084e7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -268,7 +268,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ -int peernet2id(struct net *net, struct net *peer) +int peernet2id(const struct net *net, struct net *peer) { int id; @@ -283,12 +283,12 @@ EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. */ -bool peernet_has_id(struct net *net, struct net *peer) +bool peernet_has_id(const struct net *net, struct net *peer) { return peernet2id(net, peer) >= 0; } -struct net *get_net_ns_by_id(struct net *net, int id) +struct net *get_net_ns_by_id(const struct net *net, int id) { struct net *peer; -- cgit v1.2.3 From 2eafa1746f17872483d1033b0116ec71435ea19d Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 15 Jan 2020 14:43:39 +0200 Subject: net/rds: Handle ODP mr registration/unregistration On-Demand-Paging MRs are registered using ib_reg_user_mr and unregistered with ib_dereg_mr. Signed-off-by: Hans Westgaard Ry Acked-by: Santosh Shilimkar Signed-off-by: Leon Romanovsky --- net/rds/ib.c | 7 +++ net/rds/ib.h | 3 +- net/rds/ib_mr.h | 7 ++- net/rds/ib_rdma.c | 75 ++++++++++++++++++++++++++- net/rds/ib_send.c | 44 +++++++++++----- net/rds/rdma.c | 151 ++++++++++++++++++++++++++++++++++++++++-------------- net/rds/rds.h | 13 ++++- 7 files changed, 244 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 3fd5f40189bd..a792d8a3872a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -156,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device) has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && device->ops.map_phys_fmr && device->ops.unmap_fmr); rds_ibdev->use_fastreg = (has_fr && !has_fmr); + rds_ibdev->odp_capable = + !!(device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE) && + !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? diff --git a/net/rds/ib.h b/net/rds/ib.h index 6e6f24753998..0296f1f7acda 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -247,7 +247,8 @@ struct rds_ib_device { struct ib_device *dev; struct ib_pd *pd; struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ - bool use_fastreg; + u8 use_fastreg:1; + u8 odp_capable:1; unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 9045a8c0edff..0c8252d7fe2b 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -67,6 +67,7 @@ struct rds_ib_frmr { /* This is stored as mr->r_trans_private. */ struct rds_ib_mr { + struct delayed_work work; struct rds_ib_device *device; struct rds_ib_mr_pool *pool; struct rds_ib_connection *ic; @@ -81,9 +82,11 @@ struct rds_ib_mr { unsigned int sg_len; int sg_dma_len; + u8 odp:1; union { struct rds_ib_fmr fmr; struct rds_ib_frmr frmr; + struct ib_mr *mr; } u; }; @@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, u64 start, u64 length, + int need_odp); void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); int rds_ib_mr_init(void); void rds_ib_mr_exit(void); +u32 rds_ib_get_lkey(void *trans_private); void __rds_ib_teardown_mr(struct rds_ib_mr *); void rds_ib_teardown_mr(struct rds_ib_mr *); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index c8c1e3ae8d84..ea4d7daec251 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -37,8 +37,15 @@ #include "rds_single_path.h" #include "ib_mr.h" +#include "rds.h" struct workqueue_struct *rds_ib_mr_wq; +struct rds_ib_dereg_odp_mr { + struct work_struct work; + struct ib_mr *mr; +}; + +static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { @@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction) struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; + if (ibmr->odp) + return; + switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, @@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + if (ibmr->odp) { + /* A MR created and marked as use_once. We use delayed work, + * because there is a change that we are in interrupt and can't + * call to ib_dereg_mr() directly. + */ + INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); + queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); + return; + } + /* Return it to the pool's free list */ if (rds_ibdev->use_fastreg) rds_ib_free_frmr_list(ibmr); @@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void) up_read(&rds_ib_devices_lock); } +u32 rds_ib_get_lkey(void *trans_private) +{ + struct rds_ib_mr *ibmr = trans_private; + + return ibmr->u.mr->lkey; +} + void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn) + struct rds_connection *conn, + u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; @@ -541,6 +569,42 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } + if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { + u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; + int access_flags = + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_ON_DEMAND); + struct ib_mr *ib_mr; + + if (!rds_ibdev->odp_capable) { + ret = -EOPNOTSUPP; + goto out; + } + + ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, + access_flags); + + if (IS_ERR(ib_mr)) { + rdsdebug("rds_ib_get_user_mr returned %d\n", + IS_ERR(ib_mr)); + ret = PTR_ERR(ib_mr); + goto out; + } + if (key_ret) + *key_ret = ib_mr->rkey; + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + ib_dereg_mr(ib_mr); + ret = -ENOMEM; + goto out; + } + ibmr->u.mr = ib_mr; + ibmr->odp = 1; + return ibmr; + } + if (conn) ic = conn->c_transport_data; @@ -629,3 +693,12 @@ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } + +static void rds_ib_odp_mr_worker(struct work_struct *work) +{ + struct rds_ib_mr *ibmr; + + ibmr = container_of(work, struct rds_ib_mr, work.work); + ib_dereg_mr(ibmr->u.mr); + kfree(ibmr); +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index d1cc1d7778d8..dfe778220657 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -39,6 +39,7 @@ #include "rds_single_path.h" #include "rds.h" #include "ib.h" +#include "ib_mr.h" /* * Convert IB-specific error message to RDS error message and call core @@ -635,6 +636,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[0].addr = ic->i_send_hdrs_dma[pos]; send->s_sge[0].length = sizeof(struct rds_header); + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); @@ -650,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_sge[1].addr = sg_dma_address(scat); send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; bytes_sent += len; rm->data.op_dmaoff += len; @@ -858,20 +861,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) int ret; int num_sge; int nr_sig = 0; + u64 odp_addr = op->op_odp_addr; + u32 odp_lkey = 0; /* map the op the first time we see it */ - if (!op->op_mapped) { - op->op_count = ib_dma_map_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, (op->op_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); - if (op->op_count == 0) { - rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); - ret = -ENOMEM; /* XXX ? */ - goto out; + if (!op->op_odp_mr) { + if (!op->op_mapped) { + op->op_count = + ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, + op->op_nents, + (op->op_write) ? DMA_TO_DEVICE : + DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, + op->op_count); + if (op->op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + op->op_mapped = 1; } - - op->op_mapped = 1; + } else { + op->op_count = op->op_nents; + odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private); } /* @@ -923,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) for (j = 0; j < send->s_rdma_wr.wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = sg_dma_len(scat); - send->s_sge[j].addr = sg_dma_address(scat); + if (!op->op_odp_mr) { + send->s_sge[j].addr = sg_dma_address(scat); + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; + } else { + send->s_sge[j].addr = odp_addr; + send->s_sge[j].lkey = odp_lkey; + } send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); remote_addr += len; + odp_addr += len; scat++; } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index eb23c38ce2b3..3341eee87bf9 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -177,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; + struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; - struct scatterlist *sg; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; - unsigned int nents; + unsigned int nents = 0; + int need_odp = 0; long i; int ret; @@ -197,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } + /* If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || + PAGE_ALIGN(args->vec.addr + args->vec.bytes) < + (args->vec.addr + args->vec.bytes)) { + ret = -EINVAL; + goto out; + } + + if (!can_do_mlock()) { + ret = -EPERM; + goto out; + } + nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; @@ -250,36 +266,44 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); - if (ret < 0) - goto out; - - nents = ret; - sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (!sg) { - ret = -ENOMEM; + if (ret == -EOPNOTSUPP) { + need_odp = 1; + } else if (ret <= 0) { goto out; - } - WARN_ON(!nents); - sg_init_table(sg, nents); - - /* Stick all pages into the scatterlist */ - for (i = 0 ; i < nents; i++) - sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + } else { + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); - rdsdebug("RDS: trans_private nents is %u\n", nents); + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + rdsdebug("RDS: trans_private nents is %u\n", nents); + } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ - trans_private = rs->rs_transport->get_mr(sg, nents, rs, - &mr->r_key, - cp ? cp->cp_conn : NULL); + trans_private = rs->rs_transport->get_mr( + sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, + args->vec.addr, args->vec.bytes, + need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { - for (i = 0 ; i < nents; i++) - put_page(sg_page(&sg[i])); - kfree(sg); + /* In ODP case, we don't GUP pages, so don't need + * to release anything. + */ + if (!need_odp) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + } ret = PTR_ERR(trans_private); goto out; } @@ -293,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing and pass that * around. */ - cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (need_odp) + cookie = rds_rdma_make_cookie(mr->r_key, 0); + else + cookie = rds_rdma_make_cookie(mr->r_key, + args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; @@ -458,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->op_nents; i++) { - struct page *page = sg_page(&ro->op_sg[i]); - - /* Mark page dirty if it was possibly modified, which - * is the case for a RDMA_READ which copies from remote - * to local memory */ - if (!ro->op_write) { - WARN_ON(!page->mapping && irqs_disabled()); - set_page_dirty(page); + if (ro->op_odp_mr) { + rds_mr_put(ro->op_odp_mr); + } else { + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory + */ + if (!ro->op_write) + set_page_dirty(page); + put_page(page); } - put_page(page); } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; + ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) @@ -583,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct rds_iovec *iovs; unsigned int i, j; int ret = 0; + bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) @@ -604,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = -EINVAL; goto out_ret; } + /* odp-mr is not supported for multiple requests within one message */ + if (args->nr_local != 1) + odp_supported = false; iovs = vec->iov; @@ -625,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; + op->op_odp_mr = NULL; + WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret); if (!op->op_sg) @@ -674,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, * If it's a READ operation, we need to pin the pages for writing. */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); - if (ret < 0) + if ((!odp_supported && ret <= 0) || + (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; - else - ret = 0; + + if (ret == -EOPNOTSUPP) { + struct rds_mr *local_odp_mr; + + if (!rs->rs_transport->get_mr) { + ret = -EOPNOTSUPP; + goto out_pages; + } + local_odp_mr = + kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); + if (!local_odp_mr) { + ret = -ENOMEM; + goto out_pages; + } + RB_CLEAR_NODE(&local_odp_mr->r_rb_node); + refcount_set(&local_odp_mr->r_refcount, 1); + local_odp_mr->r_trans = rs->rs_transport; + local_odp_mr->r_sock = rs; + local_odp_mr->r_trans_private = + rs->rs_transport->get_mr( + NULL, 0, rs, &local_odp_mr->r_key, NULL, + iov->addr, iov->bytes, ODP_VIRTUAL); + if (IS_ERR(local_odp_mr->r_trans_private)) { + ret = IS_ERR(local_odp_mr->r_trans_private); + rdsdebug("get_mr ret %d %p\"", ret, + local_odp_mr->r_trans_private); + kfree(local_odp_mr); + ret = -EOPNOTSUPP; + goto out_pages; + } + rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", + local_odp_mr, local_odp_mr->r_trans_private); + op->op_odp_mr = local_odp_mr; + op->op_odp_addr = iov->addr; + } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); @@ -693,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); + sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); @@ -711,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out_pages; } op->op_bytes = nr_bytes; + ret = 0; out_pages: kfree(pages); @@ -757,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { - mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + mr->r_trans->sync_mr(mr->r_trans_private, + DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; diff --git a/net/rds/rds.h b/net/rds/rds.h index 53e86911773a..e4a603523083 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -40,7 +40,6 @@ #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif - #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else @@ -478,6 +477,9 @@ struct rds_message { struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; + + u64 op_odp_addr; + struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; @@ -573,7 +575,8 @@ struct rds_transport { void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, - struct rds_connection *conn); + struct rds_connection *conn, + u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); @@ -956,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn) (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } +enum { + ODP_NOT_NEEDED, + ODP_ZEROBASED, + ODP_VIRTUAL +}; + /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ -- cgit v1.2.3 From b2dfc6765e45a3154800333234e4952b5412d792 Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 15 Jan 2020 14:43:40 +0200 Subject: net/rds: Use prefetch for On-Demand-Paging MR Try prefetching pages when using On-Demand-Paging MR using ib_advise_mr. Signed-off-by: Hans Westgaard Ry Acked-by: Santosh Shilimkar Signed-off-by: Leon Romanovsky --- net/rds/ib_rdma.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index ea4d7daec251..b34b24e237f8 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -575,6 +575,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_ON_DEMAND); + struct ib_sge sge = {}; struct ib_mr *ib_mr; if (!rds_ibdev->odp_capable) { @@ -602,6 +603,14 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr->u.mr = ib_mr; ibmr->odp = 1; + + sge.addr = virt_addr; + sge.length = length; + sge.lkey = ib_mr->lkey; + + ib_advise_mr(rds_ibdev->pd, + IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, + IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); return ibmr; } -- cgit v1.2.3 From 95f0ead8f04bec18e474594ef585f3734bd85b4c Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 19 Jan 2020 15:00:48 +0200 Subject: devlink: Add non-routable packet trap Add packet trap that can report packets that reached the router, but are non-routable. For example, IGMP queries can be flooded by the device in layer 2 and reach the router. Such packets should not be routed and instead dropped. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 6 ++++++ include/net/devlink.h | 3 +++ net/core/devlink.c | 1 + 3 files changed, 10 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 68245ea387ad..a62e1d6eb3e4 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -223,6 +223,12 @@ be added to the following table: * - ``ipv6_lpm_miss`` - ``exception`` - Traps unicast IPv6 packets that did not match any route + * - ``non_routable_packet`` + - ``drop`` + - Traps packets that the device decided to drop because they are not + supposed to be routed. For example, IGMP queries can be flooded by the + device in layer 2 and reach the router. Such packets should not be + routed and instead dropped Driver-specific Packet Traps ============================ diff --git a/include/net/devlink.h b/include/net/devlink.h index a6856f1d5d1f..08b757753e1c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -591,6 +591,7 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_REJECT_ROUTE, DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, + DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -659,6 +660,8 @@ enum devlink_trap_group_generic_id { "ipv4_lpm_miss" #define DEVLINK_TRAP_GENERIC_NAME_IPV6_LPM_UNICAST_MISS \ "ipv6_lpm_miss" +#define DEVLINK_TRAP_GENERIC_NAME_NON_ROUTABLE \ + "non_routable_packet" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" diff --git a/net/core/devlink.c b/net/core/devlink.c index d30aa47052aa..c10e38d724bc 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7706,6 +7706,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(NON_ROUTABLE, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ -- cgit v1.2.3 From 13c056ec7d006b11557cebd9f1803edd646d2876 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 19 Jan 2020 15:00:54 +0200 Subject: devlink: Add tunnel generic packet traps Add packet traps that can report packets that were dropped during tunnel decapsulation. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 8 ++++++++ include/net/devlink.h | 6 ++++++ net/core/devlink.c | 2 ++ 3 files changed, 16 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index a62e1d6eb3e4..014f0a34c0e4 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -229,6 +229,11 @@ be added to the following table: supposed to be routed. For example, IGMP queries can be flooded by the device in layer 2 and reach the router. Such packets should not be routed and instead dropped + * - ``decap_error`` + - ``exception`` + - Traps NVE and IPinIP packets that the device decided to drop because of + failure during decapsulation (e.g., packet being too short, reserved + bits set in VXLAN header) Driver-specific Packet Traps ============================ @@ -265,6 +270,9 @@ narrow. The description of these groups must be added to the following table: * - ``buffer_drops`` - Contains packet traps for packets that were dropped by the device due to an enqueue decision + * - ``tunnel_drops`` + - Contains packet traps for packets that were dropped by the device during + tunnel encapsulation / decapsulation Testing ======= diff --git a/include/net/devlink.h b/include/net/devlink.h index 08b757753e1c..455282a4b714 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -592,6 +592,7 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, + DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -605,6 +606,7 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -662,6 +664,8 @@ enum devlink_trap_group_generic_id { "ipv6_lpm_miss" #define DEVLINK_TRAP_GENERIC_NAME_NON_ROUTABLE \ "non_routable_packet" +#define DEVLINK_TRAP_GENERIC_NAME_DECAP_ERROR \ + "decap_error" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -669,6 +673,8 @@ enum devlink_trap_group_generic_id { "l3_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_BUFFER_DROPS \ "buffer_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_TUNNEL_DROPS \ + "tunnel_drops" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group, _metadata_cap) \ { \ diff --git a/net/core/devlink.c b/net/core/devlink.c index c10e38d724bc..af85fcd9b01e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7707,6 +7707,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(NON_ROUTABLE, DROP), + DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -7719,6 +7720,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), + DEVLINK_TRAP_GROUP(TUNNEL_DROPS), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From c3cae4916e57d2f0364d5e7769218547fb1b7c60 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 19 Jan 2020 15:00:58 +0200 Subject: devlink: Add overlay source MAC is multicast trap Add packet trap that can report NVE packets that the device decided to drop because their overlay source MAC is multicast. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 4 ++++ include/net/devlink.h | 3 +++ net/core/devlink.c | 1 + 3 files changed, 8 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 014f0a34c0e4..47a429bb8658 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -234,6 +234,10 @@ be added to the following table: - Traps NVE and IPinIP packets that the device decided to drop because of failure during decapsulation (e.g., packet being too short, reserved bits set in VXLAN header) + * - ``overlay_smac_is_mc`` + - ``drop`` + - Traps NVE packets that the device decided to drop because their overlay + source MAC is multicast Driver-specific Packet Traps ============================ diff --git a/include/net/devlink.h b/include/net/devlink.h index 455282a4b714..2813fd06ee89 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -593,6 +593,7 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, DEVLINK_TRAP_GENERIC_ID_NON_ROUTABLE, DEVLINK_TRAP_GENERIC_ID_DECAP_ERROR, + DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -666,6 +667,8 @@ enum devlink_trap_group_generic_id { "non_routable_packet" #define DEVLINK_TRAP_GENERIC_NAME_DECAP_ERROR \ "decap_error" +#define DEVLINK_TRAP_GENERIC_NAME_OVERLAY_SMAC_MC \ + "overlay_smac_is_mc" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" diff --git a/net/core/devlink.c b/net/core/devlink.c index af85fcd9b01e..e5b19bd2cbe2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7708,6 +7708,7 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(NON_ROUTABLE, DROP), DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), + DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ -- cgit v1.2.3 From 29237d22bc45d340792be217f40fb728ca48b6e6 Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Tue, 21 Jan 2020 01:04:46 +0100 Subject: net/smc: allow unprivileged users to read pnet table The current flags of the SMC_PNET_GET command only allow privileged users to retrieve entries from the pnet table via netlink. The content of the pnet table may be useful for all users though, e.g., for debugging smc connection problems. This patch removes the GENL_ADMIN_PERM flag so that unprivileged users can read the pnet table. Signed-off-by: Hans Wippel Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 82dedf052d86..2a5ed47c3e08 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -611,7 +611,7 @@ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, + /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start -- cgit v1.2.3 From 339821f7d2d890a61cebeb9d085643668d4df6e4 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Jan 2020 16:49:53 +0800 Subject: net/hsr: remove seq_nr_after_or_eq It's never used after introduced. So maybe better to remove. Signed-off-by: Alex Shi Cc: Arvid Brodin Cc: "David S. Miller" Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 27dc65d7de67..364ea2cc028e 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -35,7 +35,6 @@ static bool seq_nr_after(u16 a, u16 b) } #define seq_nr_before(a, b) seq_nr_after((b), (a)) -#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) -- cgit v1.2.3 From aeaec7bcebd9c85372191fb42a4478d21972d99e Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 21 Jan 2020 16:50:07 +0800 Subject: tcp/ipv4: remove AF_INET_FAMILY After commit 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") the macro isn't used anymore. remove it. Signed-off-by: Alex Shi Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6a691fd04398..a4db79b1b643 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -610,12 +610,6 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -#if IS_ENABLED(CONFIG_IPV6) -#define AF_INET_FAMILY(fam) ((fam) == AF_INET) -#else -#define AF_INET_FAMILY(fam) true -#endif - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, -- cgit v1.2.3 From 43a825afc91e2b06af1e8e7422198e759c2c5e20 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 20 Jan 2020 10:29:17 +0100 Subject: xsk, net: Make sock_def_readable() have external linkage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XDP sockets use the default implementation of struct sock's sk_data_ready callback, which is sock_def_readable(). This function is called in the XDP socket fast-path, and involves a retpoline. By letting sock_def_readable() have external linkage, and being called directly, the retpoline can be avoided. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200120092917.13949-1-bjorn.topel@gmail.com --- include/net/sock.h | 2 ++ net/core/sock.c | 2 +- net/xdp/xsk.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 8dff68b4c316..0891c55f1e82 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2612,4 +2612,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) return false; } +void sock_def_readable(struct sock *sk); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 8459ad579f73..a4c8fac781ff 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2786,7 +2786,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk) +void sock_def_readable(struct sock *sk) { struct socket_wq *wq; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 02ada7ab8c6e..df600487a68d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -217,7 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); - xs->sk.sk_data_ready(&xs->sk); + sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) -- cgit v1.2.3 From bdf2aca703e83eeecac2b492494687d5009a694e Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Wed, 22 Jan 2020 16:09:16 +0000 Subject: Bluetooth: adding missing const decoration to mgmt_status_table This change simply adds a missing const decoration to the mtmt_status_table definition. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0dc610faab70..3c68a366977f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -176,7 +176,7 @@ static const u16 mgmt_untrusted_events[] = { "\x00\x00\x00\x00\x00\x00\x00\x00" /* HCI to MGMT error code conversion table */ -static u8 mgmt_status_table[] = { +static const u8 mgmt_status_table[] = { MGMT_STATUS_SUCCESS, MGMT_STATUS_UNKNOWN_COMMAND, /* Unknown Command */ MGMT_STATUS_NOT_CONNECTED, /* No Connection */ -- cgit v1.2.3 From 6613babaf662cfbd283fcd0abdd758e338dd181d Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Wed, 22 Jan 2020 19:47:44 +0000 Subject: Bluetooth: fix appearance typo in mgmt.c This change addresses a typo in the set_appearance handler. Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3c68a366977f..3074363c68df 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3291,7 +3291,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_appearance *cp = data; - u16 apperance; + u16 appearance; int err; BT_DBG(""); @@ -3300,12 +3300,12 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, MGMT_STATUS_NOT_SUPPORTED); - apperance = le16_to_cpu(cp->appearance); + appearance = le16_to_cpu(cp->appearance); hci_dev_lock(hdev); - if (hdev->appearance != apperance) { - hdev->appearance = apperance; + if (hdev->appearance != appearance) { + hdev->appearance = appearance; if (hci_dev_test_flag(hdev, HCI_LE_ADV)) adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); -- cgit v1.2.3 From 5576b991e9c1a11d2cc21c4b94fc75ec27603896 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Jan 2020 15:36:46 -0800 Subject: bpf: Add BPF_FUNC_jiffies64 This patch adds a helper to read the 64bit jiffies. It will be used in a later patch to implement the bpf_cubic.c. The helper is inlined for jit_requested and 64 BITS_PER_LONG as the map_gen_lookup(). Other cases could be considered together with map_gen_lookup() if needed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200122233646.903260-1-kafai@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ net/core/filter.c | 2 ++ 6 files changed, 48 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05d16615054c..a9687861fd7e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1414,6 +1414,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; +extern const struct bpf_func_proto bpf_jiffies64_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e81628eb059c..f1d74a2bd234 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2886,6 +2886,12 @@ union bpf_attr { * **-EPERM** if no permission to send the *sig*. * * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3005,7 +3011,8 @@ union bpf_attr { FN(probe_read_user_str), \ FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ - FN(send_signal_thread), + FN(send_signal_thread), \ + FN(jiffies64), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29d47aae0dd1..973a20d49749 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2137,6 +2137,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; +const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cada974c9f4e..d8b7b110a1c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -312,6 +313,17 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, preempt_enable(); } +BPF_CALL_0(bpf_jiffies64) +{ + return get_jiffies_64(); +} + +const struct bpf_func_proto bpf_jiffies64_proto = { + .func = bpf_jiffies64, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fe94f64f0ec..734abaa02123 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9445,6 +9445,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_jiffies64) { + struct bpf_insn ld_jiffies_addr[2] = { + BPF_LD_IMM64(BPF_REG_0, + (unsigned long)&jiffies), + }; + + insn_buf[0] = ld_jiffies_addr[0]; + insn_buf[1] = ld_jiffies_addr[1]; + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, + BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/net/core/filter.c b/net/core/filter.c index 17de6747d9e3..4bf3e4aa8a7a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5923,6 +5923,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; default: return NULL; } -- cgit v1.2.3 From 84bf557fb02f5924c109a21a160ffc353d878487 Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:24 +0530 Subject: net: sched: pie: move common code to pie.h This patch moves macros, structures and small functions common to PIE and FQ-PIE (to be added in a future commit) from the file net/sched/sch_pie.c to the header file include/net/pie.h. All the moved functions are made inline. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_pie.c | 86 +---------------------------------------------- 2 files changed, 97 insertions(+), 85 deletions(-) create mode 100644 include/net/pie.h (limited to 'net') diff --git a/include/net/pie.h b/include/net/pie.h new file mode 100644 index 000000000000..440213ec83eb --- /dev/null +++ b/include/net/pie.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __NET_SCHED_PIE_H +#define __NET_SCHED_PIE_H + +#include +#include +#include +#include +#include + +#define QUEUE_THRESHOLD 16384 +#define DQCOUNT_INVALID -1 +#define DTIME_INVALID 0xffffffffffffffff +#define MAX_PROB 0xffffffffffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { + psched_time_t target; /* user specified target delay in pschedtime */ + u32 tupdate; /* timer frequency (in jiffies) */ + u32 limit; /* number of packets that can be enqueued */ + u32 alpha; /* alpha and beta are between 0 and 32 */ + u32 beta; /* and are used for shift relative to 1 */ + bool ecn; /* true if ecn is enabled */ + bool bytemode; /* to scale drop early prob based on pkt size */ + u8 dq_rate_estimator; /* to calculate delay using Little's law */ +}; + +/* variables used */ +struct pie_vars { + u64 prob; /* probability but scaled by u64 limit. */ + psched_time_t burst_time; + psched_time_t qdelay; + psched_time_t qdelay_old; + u64 dq_count; /* measured in bytes */ + psched_time_t dq_tstamp; /* drain rate */ + u64 accu_prob; /* accumulated drop probability */ + u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ + u32 qlen_old; /* in bytes */ + u8 accu_prob_overflows; /* overflows of accu_prob */ +}; + +/* statistics gathering */ +struct pie_stats { + u32 packets_in; /* total number of packets enqueued */ + u32 dropped; /* packets dropped due to pie_action */ + u32 overlimit; /* dropped due to lack of space in queue */ + u32 maxq; /* maximum queue size */ + u32 ecn_mark; /* packets marked with ECN */ +}; + +/* private skb vars */ +struct pie_skb_cb { + psched_time_t enqueue_time; +}; + +static inline void pie_params_init(struct pie_params *params) +{ + params->alpha = 2; + params->beta = 20; + params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ + params->limit = 1000; /* default of 1000 packets */ + params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ + params->ecn = false; + params->bytemode = false; + params->dq_rate_estimator = false; +} + +static inline void pie_vars_init(struct pie_vars *vars) +{ + vars->dq_count = DQCOUNT_INVALID; + vars->dq_tstamp = DTIME_INVALID; + vars->accu_prob = 0; + vars->avg_dq_rate = 0; + /* default of 150 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); + vars->accu_prob_overflows = 0; +} + +static inline struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); + return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) +{ + return get_pie_cb(skb)->enqueue_time; +} + +static inline void pie_set_enqueue_time(struct sk_buff *skb) +{ + get_pie_cb(skb)->enqueue_time = psched_get_time(); +} + +#endif diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b0b0dc46af61..7197bcaa14ba 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -19,47 +19,7 @@ #include #include #include - -#define QUEUE_THRESHOLD 16384 -#define DQCOUNT_INVALID -1 -#define DTIME_INVALID 0xffffffffffffffff -#define MAX_PROB 0xffffffffffffffff -#define PIE_SCALE 8 - -/* parameters used */ -struct pie_params { - psched_time_t target; /* user specified target delay in pschedtime */ - u32 tupdate; /* timer frequency (in jiffies) */ - u32 limit; /* number of packets that can be enqueued */ - u32 alpha; /* alpha and beta are between 0 and 32 */ - u32 beta; /* and are used for shift relative to 1 */ - bool ecn; /* true if ecn is enabled */ - bool bytemode; /* to scale drop early prob based on pkt size */ - u8 dq_rate_estimator; /* to calculate delay using Little's law */ -}; - -/* variables used */ -struct pie_vars { - u64 prob; /* probability but scaled by u64 limit. */ - psched_time_t burst_time; - psched_time_t qdelay; - psched_time_t qdelay_old; - u64 dq_count; /* measured in bytes */ - psched_time_t dq_tstamp; /* drain rate */ - u64 accu_prob; /* accumulated drop probability */ - u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ - u32 qlen_old; /* in bytes */ - u8 accu_prob_overflows; /* overflows of accu_prob */ -}; - -/* statistics gathering */ -struct pie_stats { - u32 packets_in; /* total number of packets enqueued */ - u32 dropped; /* packets dropped due to pie_action */ - u32 overlimit; /* dropped due to lack of space in queue */ - u32 maxq; /* maximum queue size */ - u32 ecn_mark; /* packets marked with ECN */ -}; +#include /* private data for the Qdisc */ struct pie_sched_data { @@ -70,50 +30,6 @@ struct pie_sched_data { struct Qdisc *sch; }; -static void pie_params_init(struct pie_params *params) -{ - params->alpha = 2; - params->beta = 20; - params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ - params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ - params->ecn = false; - params->bytemode = false; - params->dq_rate_estimator = false; -} - -/* private skb vars */ -struct pie_skb_cb { - psched_time_t enqueue_time; -}; - -static struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) -{ - qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb)); - return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data; -} - -static psched_time_t pie_get_enqueue_time(const struct sk_buff *skb) -{ - return get_pie_cb(skb)->enqueue_time; -} - -static void pie_set_enqueue_time(struct sk_buff *skb) -{ - get_pie_cb(skb)->enqueue_time = psched_get_time(); -} - -static void pie_vars_init(struct pie_vars *vars) -{ - vars->dq_count = DQCOUNT_INVALID; - vars->dq_tstamp = DTIME_INVALID; - vars->accu_prob = 0; - vars->avg_dq_rate = 0; - /* default of 150 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); - vars->accu_prob_overflows = 0; -} - static bool drop_early(struct Qdisc *sch, u32 packet_size) { struct pie_sched_data *q = qdisc_priv(sch); -- cgit v1.2.3 From 2dfb1952a9a1fde0b515f58605c11902e69415bf Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:28 +0530 Subject: pie: rearrange structure members and their initializations Rearrange the members of the structure such that closely referenced members appear together and/or fit in the same cacheline. Also, change the order of their initializations to match the order in which they appear in the structure. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 20 ++++++++++---------- net/sched/sch_pie.c | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/pie.h b/include/net/pie.h index f9c6a44bdb0c..ec0fbe98ec2f 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -28,13 +28,13 @@ struct pie_params { /* variables used */ struct pie_vars { - u64 prob; /* probability but scaled by u64 limit. */ - psched_time_t burst_time; psched_time_t qdelay; psched_time_t qdelay_old; - u64 dq_count; /* measured in bytes */ + psched_time_t burst_time; psched_time_t dq_tstamp; /* drain rate */ + u64 prob; /* probability but scaled by u64 limit. */ u64 accu_prob; /* accumulated drop probability */ + u64 dq_count; /* measured in bytes */ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ u32 qlen_old; /* in bytes */ u8 accu_prob_overflows; /* overflows of accu_prob */ @@ -45,8 +45,8 @@ struct pie_stats { u32 packets_in; /* total number of packets enqueued */ u32 dropped; /* packets dropped due to pie_action */ u32 overlimit; /* dropped due to lack of space in queue */ - u32 maxq; /* maximum queue size */ u32 ecn_mark; /* packets marked with ECN */ + u32 maxq; /* maximum queue size */ }; /* private skb vars */ @@ -56,11 +56,11 @@ struct pie_skb_cb { static inline void pie_params_init(struct pie_params *params) { - params->alpha = 2; - params->beta = 20; + params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ + params->alpha = 2; + params->beta = 20; params->ecn = false; params->bytemode = false; params->dq_rate_estimator = false; @@ -68,12 +68,12 @@ static inline void pie_params_init(struct pie_params *params) static inline void pie_vars_init(struct pie_vars *vars) { - vars->dq_count = DQCOUNT_INVALID; + /* default of 150 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; + vars->dq_count = DQCOUNT_INVALID; vars->avg_dq_rate = 0; - /* default of 150 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); vars->accu_prob_overflows = 0; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 7197bcaa14ba..0c583cc148f3 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -23,8 +23,8 @@ /* private data for the Qdisc */ struct pie_sched_data { - struct pie_params params; struct pie_vars vars; + struct pie_params params; struct pie_stats stats; struct timer_list adapt_timer; struct Qdisc *sch; -- cgit v1.2.3 From 55f780c4a6c3046461352f4081427b077f8d06ed Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:30 +0530 Subject: net: sched: pie: fix commenting Fix punctuation and logical mistakes in the comments. The logical mistake was that "dequeue_rate" is no longer the default way to calculate queuing delay and is not needed. The default way to calculate queue delay was changed in commit cec2975f2b70 ("net: sched: pie: enable timestamp based delay calculation"). Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- net/sched/sch_pie.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 0c583cc148f3..024f55569a38 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -248,10 +248,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) q->vars.dq_count = 0; } - /* Calculate the average drain rate from this value. If queue length - * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset + /* Calculate the average drain rate from this value. If queue length + * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset * the dq_count to -1 as we don't have enough packets to calculate the - * drain rate anymore The following if block is entered only when we + * drain rate anymore. The following if block is entered only when we * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) * and we calculate the drain rate for the threshold here. dq_count is * in bytes, time difference in psched_time, hence rate is in @@ -329,8 +329,8 @@ static void calculate_probability(struct Qdisc *sch) qdelay_old = q->vars.qdelay_old; } - /* If qdelay is zero and qlen is not, it means qlen is very small, less - * than dequeue_rate, so we do not update probabilty in this round + /* If qdelay is zero and qlen is not, it means qlen is very small, + * so we do not update probabilty in this round. */ if (qdelay == 0 && qlen != 0) update_prob = false; -- cgit v1.2.3 From 00ea2fb7274f568cd982a5958c66cab578aada25 Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:31 +0530 Subject: net: sched: pie: fix alignment in struct instances Make the alignment in the initialization of the struct instances consistent in the file. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- net/sched/sch_pie.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 024f55569a38..c65164659bca 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -132,14 +132,14 @@ out: } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { - [TCA_PIE_TARGET] = {.type = NLA_U32}, - [TCA_PIE_LIMIT] = {.type = NLA_U32}, - [TCA_PIE_TUPDATE] = {.type = NLA_U32}, - [TCA_PIE_ALPHA] = {.type = NLA_U32}, - [TCA_PIE_BETA] = {.type = NLA_U32}, - [TCA_PIE_ECN] = {.type = NLA_U32}, - [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, - [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, + [TCA_PIE_TARGET] = {.type = NLA_U32}, + [TCA_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_PIE_BETA] = {.type = NLA_U32}, + [TCA_PIE_ECN] = {.type = NLA_U32}, + [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, @@ -549,7 +549,7 @@ static void pie_destroy(struct Qdisc *sch) } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { - .id = "pie", + .id = "pie", .priv_size = sizeof(struct pie_sched_data), .enqueue = pie_qdisc_enqueue, .dequeue = pie_qdisc_dequeue, -- cgit v1.2.3 From 5205ea00cda1ac23cebfb97dfccca84722d58dfe Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:32 +0530 Subject: net: sched: pie: export symbols to be reused by FQ-PIE This patch makes the drop_early(), calculate_probability() and pie_process_dequeue() functions generic enough to be used by both PIE and FQ-PIE (to be added in a future commit). The major change here is in the way the functions take in arguments. This patch exports these functions and makes FQ-PIE dependent on sch_pie. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 9 +++ net/sched/sch_pie.c | 173 ++++++++++++++++++++++++++-------------------------- 2 files changed, 97 insertions(+), 85 deletions(-) (limited to 'net') diff --git a/include/net/pie.h b/include/net/pie.h index 51a1984c2dce..90f5db3d29e7 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -124,4 +124,13 @@ static inline void pie_set_enqueue_time(struct sk_buff *skb) get_pie_cb(skb)->enqueue_time = psched_get_time(); } +bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, + struct pie_vars *vars, u32 qlen, u32 packet_size); + +void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, + struct pie_vars *vars, u32 qlen); + +void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, + u32 qlen); + #endif diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index c65164659bca..915bcdb59a9f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -30,64 +30,65 @@ struct pie_sched_data { struct Qdisc *sch; }; -static bool drop_early(struct Qdisc *sch, u32 packet_size) +bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, + struct pie_vars *vars, u32 qlen, u32 packet_size) { - struct pie_sched_data *q = qdisc_priv(sch); u64 rnd; - u64 local_prob = q->vars.prob; + u64 local_prob = vars->prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ - if (q->vars.burst_time > 0) + if (vars->burst_time > 0) return false; /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.prob < MAX_PROB / 5)) + if ((vars->qdelay < params->target / 2) && + (vars->prob < MAX_PROB / 5)) return false; - /* If we have fewer than 2 mtu-sized packets, disable drop_early, + /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early, * similar to min_th in RED */ - if (sch->qstats.backlog < 2 * mtu) + if (qlen < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new * probablity. Smaller packets will have lower drop prob in this case */ - if (q->params.bytemode && packet_size <= mtu) + if (params->bytemode && packet_size <= mtu) local_prob = (u64)packet_size * div_u64(local_prob, mtu); else - local_prob = q->vars.prob; + local_prob = vars->prob; if (local_prob == 0) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; } - if (local_prob > MAX_PROB - q->vars.accu_prob) - q->vars.accu_prob_overflows++; + if (local_prob > MAX_PROB - vars->accu_prob) + vars->accu_prob_overflows++; - q->vars.accu_prob += local_prob; + vars->accu_prob += local_prob; - if (q->vars.accu_prob_overflows == 0 && - q->vars.accu_prob < (MAX_PROB / 100) * 85) + if (vars->accu_prob_overflows == 0 && + vars->accu_prob < (MAX_PROB / 100) * 85) return false; - if (q->vars.accu_prob_overflows == 8 && - q->vars.accu_prob >= MAX_PROB / 2) + if (vars->accu_prob_overflows == 8 && + vars->accu_prob >= MAX_PROB / 2) return true; prandom_bytes(&rnd, 8); if (rnd < local_prob) { - q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; + vars->accu_prob = 0; + vars->accu_prob_overflows = 0; return true; } return false; } +EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) @@ -100,7 +101,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } - if (!drop_early(sch, skb->len)) { + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, + skb->len)) { enqueue = true; } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && INET_ECN_set_ce(skb)) { @@ -212,26 +214,25 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, return 0; } -static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, + struct pie_vars *vars, u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - int qlen = sch->qstats.backlog; /* current queue size in bytes */ psched_time_t now = psched_get_time(); u32 dtime = 0; /* If dq_rate_estimator is disabled, calculate qdelay using the * packet timestamp. */ - if (!q->params.dq_rate_estimator) { - q->vars.qdelay = now - pie_get_enqueue_time(skb); + if (!params->dq_rate_estimator) { + vars->qdelay = now - pie_get_enqueue_time(skb); - if (q->vars.dq_tstamp != DTIME_INVALID) - dtime = now - q->vars.dq_tstamp; + if (vars->dq_tstamp != DTIME_INVALID) + dtime = now - vars->dq_tstamp; - q->vars.dq_tstamp = now; + vars->dq_tstamp = now; if (qlen == 0) - q->vars.qdelay = 0; + vars->qdelay = 0; if (dtime == 0) return; @@ -243,9 +244,9 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ - if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { - q->vars.dq_tstamp = psched_get_time(); - q->vars.dq_count = 0; + if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { + vars->dq_tstamp = psched_get_time(); + vars->dq_count = 0; } /* Calculate the average drain rate from this value. If queue length @@ -257,25 +258,25 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * in bytes, time difference in psched_time, hence rate is in * bytes/psched_time. */ - if (q->vars.dq_count != DQCOUNT_INVALID) { - q->vars.dq_count += skb->len; + if (vars->dq_count != DQCOUNT_INVALID) { + vars->dq_count += skb->len; - if (q->vars.dq_count >= QUEUE_THRESHOLD) { - u32 count = q->vars.dq_count << PIE_SCALE; + if (vars->dq_count >= QUEUE_THRESHOLD) { + u32 count = vars->dq_count << PIE_SCALE; - dtime = now - q->vars.dq_tstamp; + dtime = now - vars->dq_tstamp; if (dtime == 0) return; count = count / dtime; - if (q->vars.avg_dq_rate == 0) - q->vars.avg_dq_rate = count; + if (vars->avg_dq_rate == 0) + vars->avg_dq_rate = count; else - q->vars.avg_dq_rate = - (q->vars.avg_dq_rate - - (q->vars.avg_dq_rate >> 3)) + (count >> 3); + vars->avg_dq_rate = + (vars->avg_dq_rate - + (vars->avg_dq_rate >> 3)) + (count >> 3); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset @@ -283,10 +284,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) * packet is dequeued */ if (qlen < QUEUE_THRESHOLD) { - q->vars.dq_count = DQCOUNT_INVALID; + vars->dq_count = DQCOUNT_INVALID; } else { - q->vars.dq_count = 0; - q->vars.dq_tstamp = psched_get_time(); + vars->dq_count = 0; + vars->dq_tstamp = psched_get_time(); } goto burst_allowance_reduction; @@ -296,18 +297,18 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) return; burst_allowance_reduction: - if (q->vars.burst_time > 0) { - if (q->vars.burst_time > dtime) - q->vars.burst_time -= dtime; + if (vars->burst_time > 0) { + if (vars->burst_time > dtime) + vars->burst_time -= dtime; else - q->vars.burst_time = 0; + vars->burst_time = 0; } } +EXPORT_SYMBOL_GPL(pie_process_dequeue); -static void calculate_probability(struct Qdisc *sch) +void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, + u32 qlen) { - struct pie_sched_data *q = qdisc_priv(sch); - u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ @@ -316,17 +317,17 @@ static void calculate_probability(struct Qdisc *sch) u32 power; bool update_prob = true; - if (q->params.dq_rate_estimator) { - qdelay_old = q->vars.qdelay; - q->vars.qdelay_old = q->vars.qdelay; + if (params->dq_rate_estimator) { + qdelay_old = vars->qdelay; + vars->qdelay_old = vars->qdelay; - if (q->vars.avg_dq_rate > 0) - qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; + if (vars->avg_dq_rate > 0) + qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate; else qdelay = 0; } else { - qdelay = q->vars.qdelay; - qdelay_old = q->vars.qdelay_old; + qdelay = vars->qdelay; + qdelay_old = vars->qdelay_old; } /* If qdelay is zero and qlen is not, it means qlen is very small, @@ -342,18 +343,18 @@ static void calculate_probability(struct Qdisc *sch) * probability. alpha/beta are updated locally below by scaling down * by 16 to come to 0-2 range. */ - alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; - beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; /* We scale alpha and beta differently depending on how heavy the * congestion is. Please see RFC 8033 for details. */ - if (q->vars.prob < MAX_PROB / 10) { + if (vars->prob < MAX_PROB / 10) { alpha >>= 1; beta >>= 1; power = 100; - while (q->vars.prob < div_u64(MAX_PROB, power) && + while (vars->prob < div_u64(MAX_PROB, power) && power <= 1000000) { alpha >>= 2; beta >>= 2; @@ -362,14 +363,14 @@ static void calculate_probability(struct Qdisc *sch) } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * (u64)(qdelay - q->params.target); + delta += alpha * (u64)(qdelay - params->target); delta += beta * (u64)(qdelay - qdelay_old); - oldprob = q->vars.prob; + oldprob = vars->prob; /* to ensure we increase probability in steps of no more than 2% */ if (delta > (s64)(MAX_PROB / (100 / 2)) && - q->vars.prob >= MAX_PROB / 10) + vars->prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; /* Non-linear drop: @@ -380,12 +381,12 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); - q->vars.prob += delta; + vars->prob += delta; if (delta > 0) { /* prevent overflow */ - if (q->vars.prob < oldprob) { - q->vars.prob = MAX_PROB; + if (vars->prob < oldprob) { + vars->prob = MAX_PROB; /* Prevent normalization error. If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next @@ -395,8 +396,8 @@ static void calculate_probability(struct Qdisc *sch) } } else { /* prevent underflow */ - if (q->vars.prob > oldprob) - q->vars.prob = 0; + if (vars->prob > oldprob) + vars->prob = 0; } /* Non-linear drop in probability: Reduce drop probability quickly if @@ -405,10 +406,10 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ - q->vars.prob -= q->vars.prob / 64u; + vars->prob -= vars->prob / 64; - q->vars.qdelay = qdelay; - q->vars.qlen_old = qlen; + vars->qdelay = qdelay; + vars->qlen_old = qlen; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods @@ -416,16 +417,17 @@ static void calculate_probability(struct Qdisc *sch) * 3. If average dq_rate_estimator is enabled, we have atleast one * estimate for the avg_dq_rate ie., is a non-zero value */ - if ((q->vars.qdelay < q->params.target / 2) && - (q->vars.qdelay_old < q->params.target / 2) && - q->vars.prob == 0 && - (!q->params.dq_rate_estimator || q->vars.avg_dq_rate > 0)) { - pie_vars_init(&q->vars); + if ((vars->qdelay < params->target / 2) && + (vars->qdelay_old < params->target / 2) && + vars->prob == 0 && + (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) { + pie_vars_init(vars); } - if (!q->params.dq_rate_estimator) - q->vars.qdelay_old = qdelay; + if (!params->dq_rate_estimator) + vars->qdelay_old = qdelay; } +EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { @@ -434,7 +436,7 @@ static void pie_timer(struct timer_list *t) spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - calculate_probability(sch); + pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog); /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ if (q->params.tupdate) @@ -523,12 +525,13 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { + struct pie_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; - pie_process_dequeue(sch, skb); + pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog); return skb; } -- cgit v1.2.3 From ec97ecf1ebe485a17cd8395a5f35e6b80b57665a Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Wed, 22 Jan 2020 23:52:33 +0530 Subject: net: sched: add Flow Queue PIE packet scheduler Principles: - Packets are classified on flows. - This is a Stochastic model (as we use a hash, several flows might be hashed to the same slot) - Each flow has a PIE managed queue. - Flows are linked onto two (Round Robin) lists, so that new flows have priority on old ones. - For a given flow, packets are not reordered. - Drops during enqueue only. - ECN capability is off by default. - ECN threshold (if ECN is enabled) is at 10% by default. - Uses timestamps to calculate queue delay by default. Usage: tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ] [ target TIME ] [ tupdate TIME ] [ alpha NUMBER ] [ beta NUMBER ] [ quantum BYTES ] [ memory_limit BYTES ] [ ecnprob PERCENTAGE ] [ [no]ecn ] [ [no]bytemode ] [ [no_]dq_rate_estimator ] defaults: limit: 10240 packets, flows: 1024 target: 15 ms, tupdate: 15 ms (in jiffies) alpha: 1/8, beta : 5/4 quantum: device MTU, memory_limit: 32 Mb ecnprob: 10%, ecn: off bytemode: off, dq_rate_estimator: off Signed-off-by: Mohit P. Tahiliani Signed-off-by: Sachin D. Patil Signed-off-by: V. Saicharan Signed-off-by: Mohit Bhasi Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: David S. Miller --- include/net/pie.h | 2 + include/uapi/linux/pkt_sched.h | 31 +++ net/sched/Kconfig | 13 + net/sched/Makefile | 1 + net/sched/sch_fq_pie.c | 562 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 609 insertions(+) create mode 100644 net/sched/sch_fq_pie.c (limited to 'net') diff --git a/include/net/pie.h b/include/net/pie.h index 90f5db3d29e7..fd5a37cb7993 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -81,9 +81,11 @@ struct pie_stats { /** * struct pie_skb_cb - contains private skb vars * @enqueue_time: timestamp when the packet is enqueued + * @mem_usage: size of the skb during enqueue */ struct pie_skb_cb { psched_time_t enqueue_time; + u32 mem_usage; }; static inline void pie_params_init(struct pie_params *params) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index bf5a5b1dfb0b..bbe791b24168 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -971,6 +971,37 @@ struct tc_pie_xstats { __u32 ecn_mark; /* packets marked with ecn*/ }; +/* FQ PIE */ +enum { + TCA_FQ_PIE_UNSPEC, + TCA_FQ_PIE_LIMIT, + TCA_FQ_PIE_FLOWS, + TCA_FQ_PIE_TARGET, + TCA_FQ_PIE_TUPDATE, + TCA_FQ_PIE_ALPHA, + TCA_FQ_PIE_BETA, + TCA_FQ_PIE_QUANTUM, + TCA_FQ_PIE_MEMORY_LIMIT, + TCA_FQ_PIE_ECN_PROB, + TCA_FQ_PIE_ECN, + TCA_FQ_PIE_BYTEMODE, + TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + __TCA_FQ_PIE_MAX +}; +#define TCA_FQ_PIE_MAX (__TCA_FQ_PIE_MAX - 1) + +struct tc_fq_pie_xstats { + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to fq_pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 overmemory; /* dropped due to lack of memory in queue */ + __u32 ecn_mark; /* packets marked with ecn */ + __u32 new_flow_count; /* count of new flows created by packets */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 memory_usage; /* total memory across all queues */ +}; + /* CBS */ struct tc_cbs_qopt { __u8 offload; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b1e7ec726958..edde0e519438 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -366,6 +366,19 @@ config NET_SCH_PIE If unsure, say N. +config NET_SCH_FQ_PIE + depends on NET_SCH_PIE + tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)" + help + Say Y here if you want to use the Flow Queue Proportional Integral + controller Enhanced (FQ-PIE) packet scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc8033 + + To compile this driver as a module, choose M here: the module + will be called sch_fq_pie. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index bc8856b865ff..31c367a6cd09 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c new file mode 100644 index 000000000000..bbd0dea6b6b9 --- /dev/null +++ b/net/sched/sch_fq_pie.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Flow Queue PIE discipline + * + * Copyright (C) 2019 Mohit P. Tahiliani + * Copyright (C) 2019 Sachin D. Patil + * Copyright (C) 2019 V. Saicharan + * Copyright (C) 2019 Mohit Bhasi + * Copyright (C) 2019 Leslie Monis + * Copyright (C) 2019 Gautam Ramakrishnan + */ + +#include +#include +#include +#include +#include + +/* Flow Queue PIE + * + * Principles: + * - Packets are classified on flows. + * - This is a Stochastic model (as we use a hash, several flows might + * be hashed to the same slot) + * - Each flow has a PIE managed queue. + * - Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * - For a given flow, packets are not reordered. + * - Drops during enqueue only. + * - ECN capability is off by default. + * - ECN threshold (if ECN is enabled) is at 10% by default. + * - Uses timestamps to calculate queue delay by default. + */ + +/** + * struct fq_pie_flow - contains data for each flow + * @vars: pie vars associated with the flow + * @deficit: number of remaining byte credits + * @backlog: size of data in the flow + * @qlen: number of packets in the flow + * @flowchain: flowchain for the flow + * @head: first packet in the flow + * @tail: last packet in the flow + */ +struct fq_pie_flow { + struct pie_vars vars; + s32 deficit; + u32 backlog; + u32 qlen; + struct list_head flowchain; + struct sk_buff *head; + struct sk_buff *tail; +}; + +struct fq_pie_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct fq_pie_flow *flows; + struct Qdisc *sch; + struct list_head old_flows; + struct list_head new_flows; + struct pie_params p_params; + u32 ecn_prob; + u32 flows_cnt; + u32 quantum; + u32 memory_limit; + u32 new_flow_count; + u32 memory_usage; + u32 overmemory; + struct pie_stats stats; + struct timer_list adapt_timer; +}; + +static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q, + struct sk_buff *skb) +{ + return reciprocal_scale(skb_get_hash(skb), q->flows_cnt); +} + +static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + return fq_pie_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_pie_flow *flow, + struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct fq_pie_flow *sel_flow; + int uninitialized_var(ret); + u8 memory_limited = false; + u8 enqueue = false; + u32 pkt_len; + u32 idx; + + /* Classifies packet into corresponding flow */ + idx = fq_pie_classify(skb, sch, &ret); + sel_flow = &q->flows[idx]; + + /* Checks whether adding a new packet would exceed memory limit */ + get_pie_cb(skb)->mem_usage = skb->truesize; + memory_limited = q->memory_usage > q->memory_limit + skb->truesize; + + /* Checks if the qdisc is full */ + if (unlikely(qdisc_qlen(sch) >= sch->limit)) { + q->stats.overlimit++; + goto out; + } else if (unlikely(memory_limited)) { + q->overmemory++; + } + + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, + sel_flow->backlog, skb->len)) { + enqueue = true; + } else if (q->p_params.ecn && + sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob && + INET_ECN_set_ce(skb)) { + /* If packet is ecn capable, mark it if drop probability + * is lower than the parameter ecn_prob, else drop it. + */ + q->stats.ecn_mark++; + enqueue = true; + } + if (enqueue) { + /* Set enqueue time only when dq_rate_estimator is disabled. */ + if (!q->p_params.dq_rate_estimator) + pie_set_enqueue_time(skb); + + pkt_len = qdisc_pkt_len(skb); + q->stats.packets_in++; + q->memory_usage += skb->truesize; + sch->qstats.backlog += pkt_len; + sch->q.qlen++; + flow_queue_add(sel_flow, skb); + if (list_empty(&sel_flow->flowchain)) { + list_add_tail(&sel_flow->flowchain, &q->new_flows); + q->new_flow_count++; + sel_flow->deficit = q->quantum; + sel_flow->qlen = 0; + sel_flow->backlog = 0; + } + sel_flow->qlen++; + sel_flow->backlog += pkt_len; + return NET_XMIT_SUCCESS; + } +out: + q->stats.dropped++; + sel_flow->vars.accu_prob = 0; + sel_flow->vars.accu_prob_overflows = 0; + __qdisc_drop(skb, to_free); + qdisc_qstats_drop(sch); + return NET_XMIT_CN; +} + +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = { + [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32}, + [TCA_FQ_PIE_TARGET] = {.type = NLA_U32}, + [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32}, + [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32}, + [TCA_FQ_PIE_BETA] = {.type = NLA_U32}, + [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32}, + [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}, + [TCA_FQ_PIE_ECN] = {.type = NLA_U32}, + [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32}, + [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, +}; + +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = NULL; + struct fq_pie_flow *flow; + struct list_head *head; + u32 pkt_len; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + + flow = list_first_entry(head, struct fq_pie_flow, flowchain); + /* Flow has exhausted all its credits */ + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + if (flow->head) { + skb = dequeue_head(flow); + pkt_len = qdisc_pkt_len(skb); + sch->qstats.backlog -= pkt_len; + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + } + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if (head == &q->new_flows && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + + flow->qlen--; + flow->deficit -= pkt_len; + flow->backlog -= pkt_len; + q->memory_usage -= get_pie_cb(skb)->mem_usage; + pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog); + return skb; +} + +static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; + unsigned int len_dropped = 0; + unsigned int num_dropped = 0; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); + if (err < 0) + return err; + + sch_tree_lock(sch); + if (tb[TCA_FQ_PIE_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]); + + q->p_params.limit = limit; + sch->limit = limit; + } + if (tb[TCA_FQ_PIE_FLOWS]) { + if (q->flows) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows cannot be changed"); + goto flow_error; + } + q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]); + if (!q->flows_cnt || q->flows_cnt > 65536) { + NL_SET_ERR_MSG_MOD(extack, + "Number of flows must be < 65536"); + goto flow_error; + } + } + + /* convert from microseconds to pschedtime */ + if (tb[TCA_FQ_PIE_TARGET]) { + /* target is in us */ + u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]); + + /* convert to pschedtime */ + q->p_params.target = + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + } + + /* tupdate is in jiffies */ + if (tb[TCA_FQ_PIE_TUPDATE]) + q->p_params.tupdate = + usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])); + + if (tb[TCA_FQ_PIE_ALPHA]) + q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]); + + if (tb[TCA_FQ_PIE_BETA]) + q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]); + + if (tb[TCA_FQ_PIE_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]); + + if (tb[TCA_FQ_PIE_MEMORY_LIMIT]) + q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + + if (tb[TCA_FQ_PIE_ECN_PROB]) + q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]); + + if (tb[TCA_FQ_PIE_ECN]) + q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]); + + if (tb[TCA_FQ_PIE_BYTEMODE]) + q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]); + + if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) + q->p_params.dq_rate_estimator = + nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + + /* Drop excess packets if new limit is lower */ + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + + kfree_skb(skb); + len_dropped += qdisc_pkt_len(skb); + num_dropped += 1; + } + qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + + sch_tree_unlock(sch); + return 0; + +flow_error: + sch_tree_unlock(sch); + return -EINVAL; +} + +static void fq_pie_timer(struct timer_list *t) +{ + struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + u16 idx; + + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + for (idx = 0; idx < q->flows_cnt; idx++) + pie_calculate_probability(&q->p_params, &q->flows[idx].vars, + q->flows[idx].backlog); + + /* reset the timer to fire after 'tupdate' jiffies. */ + if (q->p_params.tupdate) + mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate); + + spin_unlock(root_lock); +} + +static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + int err; + u16 idx; + + pie_params_init(&q->p_params); + sch->limit = 10 * 1024; + q->p_params.limit = sch->limit; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->sch = sch; + q->ecn_prob = 10; + q->flows_cnt = 1024; + q->memory_limit = SZ_32M; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + + if (opt) { + err = fq_pie_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + goto init_failure; + + q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow), + GFP_KERNEL); + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + timer_setup(&q->adapt_timer, fq_pie_timer, 0); + mod_timer(&q->adapt_timer, jiffies + HZ / 2); + + return 0; + +init_failure: + q->flows_cnt = 0; + + return err; +} + +static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + return -EMSGSIZE; + + /* convert target from pschedtime to us */ + if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) || + nla_put_u32(skb, TCA_FQ_PIE_TARGET, + ((u32)PSCHED_TICKS2NS(q->p_params.target)) / + NSEC_PER_USEC) || + nla_put_u32(skb, TCA_FQ_PIE_TUPDATE, + jiffies_to_usecs(q->p_params.tupdate)) || + nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) || + nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) || + nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) || + nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) || + nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) || + nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) || + nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + q->p_params.dq_rate_estimator)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + struct tc_fq_pie_xstats st = { + .packets_in = q->stats.packets_in, + .overlimit = q->stats.overlimit, + .overmemory = q->overmemory, + .dropped = q->stats.dropped, + .ecn_mark = q->stats.ecn_mark, + .new_flow_count = q->new_flow_count, + .memory_usage = q->memory_usage, + }; + struct list_head *pos; + + sch_tree_lock(sch); + list_for_each(pos, &q->new_flows) + st.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.old_flows_len++; + sch_tree_unlock(sch); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void fq_pie_reset(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + u16 idx; + + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + for (idx = 0; idx < q->flows_cnt; idx++) { + struct fq_pie_flow *flow = q->flows + idx; + + /* Removes all packets from flow */ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; + + INIT_LIST_HEAD(&flow->flowchain); + pie_vars_init(&flow->vars); + } + + sch->q.qlen = 0; + sch->qstats.backlog = 0; +} + +static void fq_pie_destroy(struct Qdisc *sch) +{ + struct fq_pie_sched_data *q = qdisc_priv(sch); + + tcf_block_put(q->block); + del_timer_sync(&q->adapt_timer); + kvfree(q->flows); +} + +static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = { + .id = "fq_pie", + .priv_size = sizeof(struct fq_pie_sched_data), + .enqueue = fq_pie_qdisc_enqueue, + .dequeue = fq_pie_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_pie_init, + .destroy = fq_pie_destroy, + .reset = fq_pie_reset, + .change = fq_pie_change, + .dump = fq_pie_dump, + .dump_stats = fq_pie_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_pie_module_init(void) +{ + return register_qdisc(&fq_pie_qdisc_ops); +} + +static void __exit fq_pie_module_exit(void) +{ + unregister_qdisc(&fq_pie_qdisc_ops); +} + +module_init(fq_pie_module_init); +module_exit(fq_pie_module_exit); + +MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"); +MODULE_AUTHOR("Mohit P. Tahiliani"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From ac0e932d0e2988c125be251715d83e95839cdae3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 24 Jan 2020 13:40:19 +0200 Subject: net: bridge: check port state before br_allowed_egress If we make sure that br_allowed_egress is called only when we have BR_STATE_FORWARDING state then we can avoid a test later when we add per-vlan state. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 86637000f275..7629b63f6f30 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,7 +25,7 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && + p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } -- cgit v1.2.3 From 7a53e718c551752924416b33235bf338729b5115 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 24 Jan 2020 13:40:20 +0200 Subject: net: bridge: vlan: add basic option dumping support We'll be dumping the options for the whole range if they're equal. The first range vlan will be used to extract the options. The commit doesn't change anything yet it just adds the skeleton for the support. The dump will happen when the first option is added. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 +- net/bridge/br_private.h | 8 ++++++++ net/bridge/br_vlan.c | 20 ++++++++++++++------ net/bridge/br_vlan_options.c | 25 +++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 net/bridge/br_vlan_options.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index ac9ef337f0fa..49da7ae6f077 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o -bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o +bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a6226ff2f0cc..403df71d2cfa 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1191,6 +1191,14 @@ static inline void br_vlan_notify(const struct net_bridge *br, } #endif +/* br_vlan_options.c */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, + const struct net_bridge_vlan *v2); +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); +size_t br_vlan_opts_nl_size(void); +#endif + struct nf_br_ops { int (*br_dev_xmit_hook)(struct sk_buff *skb); }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index e4f7dd10c3f8..75ec3da92b0b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1547,7 +1547,9 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) } } +/* v_opts is used to dump the options which must be equal in the whole range */ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts, u16 flags) { struct bridge_vlan_info info; @@ -1572,6 +1574,9 @@ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) goto out_err; + if (v_opts && !br_vlan_opts_fill(skb, v_opts)) + goto out_err; + nla_nest_end(skb, nest); return true; @@ -1586,7 +1591,8 @@ static size_t rtnl_vlan_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ - + nla_total_size(sizeof(struct bridge_vlan_info)); /* BRIDGE_VLANDB_ENTRY_INFO */ + + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ + + br_vlan_opts_nl_size(); /* bridge vlan options */ } void br_vlan_notify(const struct net_bridge *br, @@ -1595,7 +1601,7 @@ void br_vlan_notify(const struct net_bridge *br, int cmd) { struct net_bridge_vlan_group *vg; - struct net_bridge_vlan *v; + struct net_bridge_vlan *v = NULL; struct br_vlan_msg *bvm; struct nlmsghdr *nlh; struct sk_buff *skb; @@ -1647,7 +1653,7 @@ void br_vlan_notify(const struct net_bridge *br, goto out_kfree; } - if (!br_vlan_fill_vids(skb, vid, vid_range, flags)) + if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags)) goto out_err; nlmsg_end(skb, nlh); @@ -1665,7 +1671,8 @@ static bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return v_curr->vid - range_end->vid == 1 && - range_end->flags == v_curr->flags; + range_end->flags == v_curr->flags && + br_vlan_opts_eq(v_curr, range_end); } static int br_vlan_dump_dev(const struct net_device *dev, @@ -1729,7 +1736,8 @@ static int br_vlan_dump_dev(const struct net_device *dev, u16 flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, - range_end->vid, flags)) { + range_end->vid, range_start, + flags)) { err = -EMSGSIZE; break; } @@ -1748,7 +1756,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, */ if (!err && range_start && !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, - br_vlan_flags(range_start, pvid))) + range_start, br_vlan_flags(range_start, pvid))) err = -EMSGSIZE; cb->args[1] = err ? idx : 0; diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c new file mode 100644 index 000000000000..55fcdc9c380c --- /dev/null +++ b/net/bridge/br_vlan_options.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020, Nikolay Aleksandrov +#include +#include +#include +#include + +#include "br_private.h" + +/* check if the options between two vlans are equal */ +bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, + const struct net_bridge_vlan *v2) +{ + return true; +} + +bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) +{ + return true; +} + +size_t br_vlan_opts_nl_size(void) +{ + return 0; +} -- cgit v1.2.3 From a5d29ae226812ae620f58a0726525d8af9b1d751 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 24 Jan 2020 13:40:21 +0200 Subject: net: bridge: vlan: add basic option setting support This patch adds support for option modification of single vlans and ranges. It allows to only modify options, i.e. skip create/delete by using the BRIDGE_VLAN_INFO_ONLY_OPTS flag. When working with a range option changes we try to pack the notifications as much as possible. v2: do full port (all vlans) notification only when creating/deleting vlans for compatibility, rework the range detection when changing options, add more verbose extack errors and check if a vlan should be used (br_vlan_should_use checks) Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_private.h | 8 ++++ net/bridge/br_vlan.c | 41 ++++++++++++++++---- net/bridge/br_vlan_options.c | 87 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 130 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index ac38f0b674b8..c8b87ad52c5b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -130,6 +130,7 @@ enum { #define BRIDGE_VLAN_INFO_RANGE_BEGIN (1<<3) /* VLAN is start of vlan range */ #define BRIDGE_VLAN_INFO_RANGE_END (1<<4) /* VLAN is end of vlan range */ #define BRIDGE_VLAN_INFO_BRENTRY (1<<5) /* Global bridge VLAN entry */ +#define BRIDGE_VLAN_INFO_ONLY_OPTS (1<<6) /* Skip create/delete/flags */ struct bridge_vlan_info { __u16 flags; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 403df71d2cfa..084904ee22a8 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -976,6 +976,8 @@ void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd); +bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -1197,6 +1199,12 @@ bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, const struct net_bridge_vlan *v2); bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); size_t br_vlan_opts_nl_size(void); +int br_vlan_process_options(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan *range_start, + struct net_bridge_vlan *range_end, + struct nlattr **tb, + struct netlink_ext_ack *extack); #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 75ec3da92b0b..d747756eac63 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1667,8 +1667,8 @@ out_kfree: } /* check if v_curr can enter a range ending in range_end */ -static bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, - const struct net_bridge_vlan *range_end) +bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) { return v_curr->vid - range_end->vid == 1 && range_end->flags == v_curr->flags && @@ -1824,11 +1824,11 @@ static int br_vlan_rtm_process_one(struct net_device *dev, { struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; + bool changed = false, skip_processing = false; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0, cmdmap = 0; struct net_bridge *br; - bool changed = false; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); @@ -1882,16 +1882,43 @@ static int br_vlan_rtm_process_one(struct net_device *dev, switch (cmd) { case RTM_NEWVLAN: cmdmap = RTM_SETLINK; + skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); break; case RTM_DELVLAN: cmdmap = RTM_DELLINK; break; } - err = br_process_vlan_info(br, p, cmdmap, vinfo, &vinfo_last, &changed, - extack); - if (changed) - br_ifinfo_notify(cmdmap, br, p); + if (!skip_processing) { + struct bridge_vlan_info *tmp_last = vinfo_last; + + /* br_process_vlan_info may overwrite vinfo_last */ + err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, + &changed, extack); + + /* notify first if anything changed */ + if (changed) + br_ifinfo_notify(cmdmap, br, p); + + if (err) + return err; + } + + /* deal with options */ + if (cmd == RTM_NEWVLAN) { + struct net_bridge_vlan *range_start, *range_end; + + if (vinfo_last) { + range_start = br_vlan_find(vg, vinfo_last->vid); + range_end = br_vlan_find(vg, vinfo->vid); + } else { + range_start = br_vlan_find(vg, vinfo->vid); + range_end = range_start; + } + + err = br_vlan_process_options(br, p, range_start, range_end, + tb, extack); + } return err; } diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 55fcdc9c380c..27275ac3e42e 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -23,3 +23,90 @@ size_t br_vlan_opts_nl_size(void) { return 0; } + +static int br_vlan_process_one_opts(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + *changed = false; + return 0; +} + +int br_vlan_process_options(const struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_vlan *range_start, + struct net_bridge_vlan *range_end, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; + struct net_bridge_vlan_group *vg; + int vid, err = 0; + u16 pvid; + + if (p) + vg = nbp_vlan_group(p); + else + vg = br_vlan_group(br); + + if (!range_start || !br_vlan_should_use(range_start)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options"); + return -ENOENT; + } + if (!range_end || !br_vlan_should_use(range_end)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options"); + return -ENOENT; + } + + pvid = br_get_pvid(vg); + for (vid = range_start->vid; vid <= range_end->vid; vid++) { + bool changed = false; + + v = br_vlan_find(vg, vid); + if (!v || !br_vlan_should_use(v)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options"); + err = -ENOENT; + break; + } + + err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed, + extack); + if (err) + break; + + if (changed) { + /* vlan options changed, check for range */ + if (!curr_start) { + curr_start = v; + curr_end = v; + continue; + } + + if (v->vid == pvid || + !br_vlan_can_enter_range(v, curr_end)) { + br_vlan_notify(br, p, curr_start->vid, + curr_end->vid, RTM_NEWVLAN); + curr_start = v; + } + curr_end = v; + } else { + /* nothing changed and nothing to notify yet */ + if (!curr_start) + continue; + + br_vlan_notify(br, p, curr_start->vid, curr_end->vid, + RTM_NEWVLAN); + curr_start = NULL; + curr_end = NULL; + } + } + if (curr_start) + br_vlan_notify(br, p, curr_start->vid, curr_end->vid, + RTM_NEWVLAN); + + return err; +} -- cgit v1.2.3 From a580c76d534c7360ba68042b19cb255e8420e987 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 24 Jan 2020 13:40:22 +0200 Subject: net: bridge: vlan: add per-vlan state The first per-vlan option added is state, it is needed for EVPN and for per-vlan STP. The state allows to control the forwarding on per-vlan basis. The vlan state is considered only if the port state is forwarding in order to avoid conflicts and be consistent. br_allowed_egress is called only when the state is forwarding, but the ingress case is a bit more complicated due to the fact that we may have the transition between port:BR_STATE_FORWARDING -> vlan:BR_STATE_LEARNING which should still allow the bridge to learn from the packet after vlan filtering and it will be dropped after that. Also to optimize the pvid state check we keep a copy in the vlan group to avoid one lookup. The state members are modified with *_ONCE() to annotate the lockless access. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_device.c | 3 ++- net/bridge/br_input.c | 7 ++++-- net/bridge/br_private.h | 43 ++++++++++++++++++++++++++++++++-- net/bridge/br_vlan.c | 47 ++++++++++++++++++++++++++++---------- net/bridge/br_vlan_options.c | 52 ++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 134 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c8b87ad52c5b..42f7ca38ad80 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -191,6 +191,7 @@ enum { BRIDGE_VLANDB_ENTRY_UNSPEC, BRIDGE_VLANDB_ENTRY_INFO, BRIDGE_VLANDB_ENTRY_RANGE, + BRIDGE_VLANDB_ENTRY_STATE, __BRIDGE_VLANDB_ENTRY_MAX, }; #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index fb38add21b37..dc3d2c1dd9d5 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -32,6 +32,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_mdb_entry *mdst; struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; + u8 state = BR_STATE_FORWARDING; const unsigned char *dest; struct ethhdr *eth; u16 vid = 0; @@ -56,7 +57,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) goto out; if (IS_ENABLED(CONFIG_INET) && diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8944ceb47fe9..fcc260840028 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -76,11 +76,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb bool local_rcv, mcast_hit = false; struct net_bridge *br; u16 vid = 0; + u8 state; if (!p || p->state == BR_STATE_DISABLED) goto drop; - if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) + state = p->state; + if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, + &state)) goto out; nbp_switchdev_frame_mark(p, skb); @@ -103,7 +106,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } } - if (p->state == BR_STATE_LEARNING) + if (state == BR_STATE_LEARNING) goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 084904ee22a8..5153ffe79a01 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -113,6 +113,7 @@ enum { * @vid: VLAN id * @flags: bridge vlan flags * @priv_flags: private (in-kernel) bridge vlan flags + * @state: STP state (e.g. blocking, learning, forwarding) * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct @@ -133,6 +134,7 @@ struct net_bridge_vlan { u16 vid; u16 flags; u16 priv_flags; + u8 state; struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; @@ -157,6 +159,7 @@ struct net_bridge_vlan { * @vlan_list: sorted VLAN entry list * @num_vlans: number of total VLAN entries * @pvid: PVID VLAN id + * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking) * * IMPORTANT: Be careful when checking if there're VLAN entries using list * primitives because the bridge can have entries in its list which @@ -170,6 +173,7 @@ struct net_bridge_vlan_group { struct list_head vlan_list; u16 num_vlans; u16 pvid; + u8 pvid_state; }; /* bridge fdb flags */ @@ -935,7 +939,7 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid); + u16 *vid, u8 *state); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); @@ -1037,7 +1041,7 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid) + u16 *vid, u8 *state) { return true; } @@ -1205,6 +1209,41 @@ int br_vlan_process_options(const struct net_bridge *br, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack); + +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) +{ + return READ_ONCE(v->state); +} + +static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) +{ + WRITE_ONCE(v->state, state); +} + +static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) +{ + return READ_ONCE(vg->pvid_state); +} + +static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg, + u8 state) +{ + WRITE_ONCE(vg->pvid_state, state); +} + +/* learn_allow is true at ingress and false at egress */ +static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) +{ + switch (state) { + case BR_STATE_LEARNING: + return learn_allow; + case BR_STATE_FORWARDING: + return true; + default: + return false; + } +} #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index d747756eac63..6b5deca08b89 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -34,13 +34,15 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } -static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid) +static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, + const struct net_bridge_vlan *v) { - if (vg->pvid == vid) + if (vg->pvid == v->vid) return false; smp_wmb(); - vg->pvid = vid; + br_vlan_set_pvid_state(vg, v->state); + vg->pvid = v->vid; return true; } @@ -69,7 +71,7 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) vg = nbp_vlan_group(v->port); if (flags & BRIDGE_VLAN_INFO_PVID) - ret = __vlan_add_pvid(vg, v->vid); + ret = __vlan_add_pvid(vg, v); else ret = __vlan_delete_pvid(vg, v->vid); @@ -293,6 +295,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, vg->num_vlans++; } + /* set the state before publishing */ + v->state = BR_STATE_FORWARDING; + err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); if (err) @@ -466,7 +471,8 @@ out: /* Called under RCU */ static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, - struct sk_buff *skb, u16 *vid) + struct sk_buff *skb, u16 *vid, + u8 *state) { struct br_vlan_stats *stats; struct net_bridge_vlan *v; @@ -532,13 +538,25 @@ static bool __allowed_ingress(const struct net_bridge *br, skb->vlan_tci |= pvid; /* if stats are disabled we can avoid the lookup */ - if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) - return true; + if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { + if (*state == BR_STATE_FORWARDING) { + *state = br_vlan_get_pvid_state(vg); + return br_vlan_state_allowed(*state, true); + } else { + return true; + } + } } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; + if (*state == BR_STATE_FORWARDING) { + *state = br_vlan_get_state(v); + if (!br_vlan_state_allowed(*state, true)) + goto drop; + } + if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); @@ -556,7 +574,7 @@ drop: bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid) + u16 *vid, u8 *state) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. @@ -566,7 +584,7 @@ bool br_allowed_ingress(const struct net_bridge *br, return true; } - return __allowed_ingress(br, vg, skb, vid); + return __allowed_ingress(br, vg, skb, vid, state); } /* Called under RCU. */ @@ -582,7 +600,8 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg, br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); - if (v && br_vlan_should_use(v)) + if (v && br_vlan_should_use(v) && + br_vlan_state_allowed(br_vlan_get_state(v), false)) return true; return false; @@ -593,6 +612,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; + struct net_bridge_vlan *v; /* If filtering was disabled at input, let it pass. */ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) @@ -607,13 +627,15 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) if (!*vid) { *vid = br_get_pvid(vg); - if (!*vid) + if (!*vid || + !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) return false; return true; } - if (br_vlan_find(vg, *vid)) + v = br_vlan_find(vg, *vid); + if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) return true; return false; @@ -1816,6 +1838,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN, .len = sizeof(struct bridge_vlan_info) }, [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 27275ac3e42e..cd2eb194eb98 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -11,16 +11,54 @@ bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, const struct net_bridge_vlan *v2) { - return true; + return v1->state == v2->state; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { - return true; + return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, + br_vlan_get_state(v)); } size_t br_vlan_opts_nl_size(void) { + return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */ +} + +static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + u8 state, + bool *changed, + struct netlink_ext_ack *extack) +{ + struct net_bridge *br; + + ASSERT_RTNL(); + + if (state > BR_STATE_BLOCKING) { + NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state"); + return -EINVAL; + } + + if (br_vlan_is_brentry(v)) + br = v->br; + else + br = v->port->br; + + if (br->stp_enabled == BR_KERNEL_STP) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP"); + return -EBUSY; + } + + if (v->state == state) + return 0; + + if (v->vid == br_get_pvid(vg)) + br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); + *changed = true; + return 0; } @@ -32,7 +70,17 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, bool *changed, struct netlink_ext_ack *extack) { + int err; + *changed = false; + if (tb[BRIDGE_VLANDB_ENTRY_STATE]) { + u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]); + + err = br_vlan_modify_state(vg, v, state, changed, extack); + if (err) + return err; + } + return 0; } -- cgit v1.2.3 From f870fa0b5768842cb4690c1c11f19f28b731ae6d Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 21 Jan 2020 16:56:15 -0800 Subject: mptcp: Add MPTCP socket stubs Implements the infrastructure for MPTCP sockets. MPTCP sockets open one in-kernel TCP socket per subflow. These subflow sockets are only managed by the MPTCP socket that owns them and are not visible from userspace. This commit allows a userspace program to open an MPTCP socket with: sock = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); The resulting socket is simply a wrapper around a single regular TCP socket, without any of the MPTCP protocol implemented over the wire. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- MAINTAINERS | 1 + include/net/mptcp.h | 16 ++++++ net/Kconfig | 1 + net/Makefile | 1 + net/ipv4/tcp.c | 2 + net/ipv6/tcp_ipv6.c | 7 +++ net/mptcp/Kconfig | 16 ++++++ net/mptcp/Makefile | 4 ++ net/mptcp/protocol.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 22 ++++++++ 10 files changed, 212 insertions(+) create mode 100644 net/mptcp/Kconfig create mode 100644 net/mptcp/Makefile create mode 100644 net/mptcp/protocol.c create mode 100644 net/mptcp/protocol.h (limited to 'net') diff --git a/MAINTAINERS b/MAINTAINERS index 702382b89c37..4be9c1b50183 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11583,6 +11583,7 @@ W: https://github.com/multipath-tcp/mptcp_net-next/wiki B: https://github.com/multipath-tcp/mptcp_net-next/issues S: Maintained F: include/net/mptcp.h +F: net/mptcp/ NETWORKING [TCP] M: Eric Dumazet diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0573ae75c3db..98ba22379117 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -28,6 +28,8 @@ struct mptcp_ext { #ifdef CONFIG_MPTCP +void mptcp_init(void); + /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ @@ -70,6 +72,10 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, #else +static inline void mptcp_init(void) +{ +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { @@ -82,4 +88,14 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, } #endif /* CONFIG_MPTCP */ + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int mptcpv6_init(void); +#elif IS_ENABLED(CONFIG_IPV6) +static inline int mptcpv6_init(void) +{ + return 0; +} +#endif + #endif /* __NET_MPTCP_H */ diff --git a/net/Kconfig b/net/Kconfig index 54916b7adb9b..b0937a700f01 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -91,6 +91,7 @@ if INET source "net/ipv4/Kconfig" source "net/ipv6/Kconfig" source "net/netlabel/Kconfig" +source "net/mptcp/Kconfig" endif # if INET diff --git a/net/Makefile b/net/Makefile index 848303d98d3d..07ea48160874 100644 --- a/net/Makefile +++ b/net/Makefile @@ -87,3 +87,4 @@ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ +obj-$(CONFIG_MPTCP) += mptcp/ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6711a97de3ce..7dfb78caedaf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include #include #include +#include #include #include #include @@ -4021,4 +4022,5 @@ void __init tcp_init(void) tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); + mptcp_init(); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5b5260103b65..60068ffde1d9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2163,9 +2163,16 @@ int __init tcpv6_init(void) ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; + + ret = mptcpv6_init(); + if (ret) + goto out_tcpv6_pernet_subsys; + out: return ret; +out_tcpv6_pernet_subsys: + unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig new file mode 100644 index 000000000000..c1a99f07a4cd --- /dev/null +++ b/net/mptcp/Kconfig @@ -0,0 +1,16 @@ + +config MPTCP + bool "MPTCP: Multipath TCP" + depends on INET + select SKB_EXTENSIONS + help + Multipath TCP (MPTCP) connections send and receive data over multiple + subflows in order to utilize multiple network paths. Each subflow + uses the TCP protocol, and TCP options carry header information for + MPTCP. + +config MPTCP_IPV6 + bool "MPTCP: IPv6 support for Multipath TCP" + depends on MPTCP + select IPV6 + default y diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile new file mode 100644 index 000000000000..659129d1fcbf --- /dev/null +++ b/net/mptcp/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MPTCP) += mptcp.o + +mptcp-y := protocol.o diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c new file mode 100644 index 000000000000..5e24e7cf7d70 --- /dev/null +++ b/net/mptcp/protocol.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "protocol.h" + +static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *subflow = msk->subflow; + + if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + return -EOPNOTSUPP; + + return sock_sendmsg(subflow, msg); +} + +static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *subflow = msk->subflow; + + if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) + return -EOPNOTSUPP; + + return sock_recvmsg(subflow, msg, flags); +} + +static int mptcp_init_sock(struct sock *sk) +{ + return 0; +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + inet_sk_state_store(sk, TCP_CLOSE); + + if (msk->subflow) { + pr_debug("subflow=%p", msk->subflow->sk); + sock_release(msk->subflow); + } + + sock_orphan(sk); + sock_put(sk); +} + +static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int err; + + saddr->sa_family = AF_INET; + + pr_debug("msk=%p, subflow=%p", msk, msk->subflow->sk); + + err = kernel_connect(msk->subflow, saddr, len, 0); + + sk->sk_state = TCP_ESTABLISHED; + + return err; +} + +static struct proto mptcp_prot = { + .name = "MPTCP", + .owner = THIS_MODULE, + .init = mptcp_init_sock, + .close = mptcp_close, + .accept = inet_csk_accept, + .connect = mptcp_connect, + .shutdown = tcp_shutdown, + .sendmsg = mptcp_sendmsg, + .recvmsg = mptcp_recvmsg, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = inet_csk_get_port, + .obj_size = sizeof(struct mptcp_sock), + .no_autobind = true, +}; + +static struct inet_protosw mptcp_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_prot, + .ops = &inet_stream_ops, +}; + +void __init mptcp_init(void) +{ + if (proto_register(&mptcp_prot, 1) != 0) + panic("Failed to register MPTCP proto.\n"); + + inet_register_protosw(&mptcp_protosw); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct proto mptcp_v6_prot; + +static struct inet_protosw mptcp_v6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + .prot = &mptcp_v6_prot, + .ops = &inet6_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +int mptcpv6_init(void) +{ + int err; + + mptcp_v6_prot = mptcp_prot; + strcpy(mptcp_v6_prot.name, "MPTCPv6"); + mptcp_v6_prot.slab = NULL; + mptcp_v6_prot.obj_size = sizeof(struct mptcp_sock) + + sizeof(struct ipv6_pinfo); + + err = proto_register(&mptcp_v6_prot, 1); + if (err) + return err; + + err = inet6_register_protosw(&mptcp_v6_protosw); + if (err) + proto_unregister(&mptcp_v6_prot); + + return err; +} +#endif diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h new file mode 100644 index 000000000000..ee04a01bffd3 --- /dev/null +++ b/net/mptcp/protocol.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#ifndef __MPTCP_PROTOCOL_H +#define __MPTCP_PROTOCOL_H + +/* MPTCP connection sock */ +struct mptcp_sock { + /* inet_connection_sock must be the first member */ + struct inet_connection_sock sk; + struct socket *subflow; /* outgoing connect/listener/!mp_capable */ +}; + +static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) +{ + return (struct mptcp_sock *)sk; +} + +#endif /* __MPTCP_PROTOCOL_H */ -- cgit v1.2.3 From eda7acddf8080bb2d022a8d4b8b2345eb80c63ec Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:16 -0800 Subject: mptcp: Handle MPTCP TCP options Add hooks to parse and format the MP_CAPABLE option. This option is handled according to MPTCP version 0 (RFC6824). MPTCP version 1 MP_CAPABLE (RFC6824bis/RFC8684) will be added later in coordination with related code changes. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- include/linux/tcp.h | 18 ++++++++++ include/net/mptcp.h | 18 ++++++++++ net/ipv4/tcp_input.c | 5 +++ net/ipv4/tcp_output.c | 13 +++++++ net/mptcp/Makefile | 2 +- net/mptcp/options.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 29 +++++++++++++++ 7 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 net/mptcp/options.c (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ca6f01531e64..52798ab00394 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -78,6 +78,16 @@ struct tcp_sack_block { #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ +#if IS_ENABLED(CONFIG_MPTCP) +struct mptcp_options_received { + u64 sndr_key; + u64 rcvr_key; + u8 mp_capable : 1, + mp_join : 1, + dss : 1; +}; +#endif + struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -95,6 +105,9 @@ struct tcp_options_received { u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ +#if IS_ENABLED(CONFIG_MPTCP) + struct mptcp_options_received mptcp; +#endif }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) @@ -104,6 +117,11 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif +#if IS_ENABLED(CONFIG_MPTCP) + rx_opt->mptcp.mp_capable = 0; + rx_opt->mptcp.mp_join = 0; + rx_opt->mptcp.dss = 0; +#endif } /* This is the max number of SACKS that we'll generate and process. It's safe diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 98ba22379117..3daec2ceb3ff 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -9,6 +9,7 @@ #define __NET_MPTCP_H #include +#include #include /* MPTCP sk_buff extension data */ @@ -26,10 +27,22 @@ struct mptcp_ext { /* one byte hole */ }; +struct mptcp_out_options { +#if IS_ENABLED(CONFIG_MPTCP) + u16 suboptions; + u64 sndr_key; + u64 rcvr_key; +#endif +}; + #ifdef CONFIG_MPTCP void mptcp_init(void); +void mptcp_parse_option(const unsigned char *ptr, int opsize, + struct tcp_options_received *opt_rx); +void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); + /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ @@ -76,6 +89,11 @@ static inline void mptcp_init(void) { } +static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, + struct tcp_options_received *opt_rx) +{ +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 358365598216..3458ee13e6f0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,6 +79,7 @@ #include #include #include +#include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -3924,6 +3925,10 @@ void tcp_parse_options(const struct net *net, */ break; #endif + case TCPOPT_MPTCP: + mptcp_parse_option(ptr, opsize, opt_rx); + break; + case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 786978cb2db7..0f0984f39f67 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include @@ -414,6 +415,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) +#define OPTION_MPTCP (1 << 10) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -439,8 +441,17 @@ struct tcp_out_options { __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ + struct mptcp_out_options mptcp; }; +static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +{ +#if IS_ENABLED(CONFIG_MPTCP) + if (unlikely(OPTION_MPTCP & opts->options)) + mptcp_write_options(ptr, &opts->mptcp); +#endif +} + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -549,6 +560,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } smc_options_write(ptr, &options); + + mptcp_options_write(ptr, opts); } static void smc_set_option(const struct tcp_sock *tp, diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 659129d1fcbf..27a846263f08 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o +mptcp-y := protocol.o options.o diff --git a/net/mptcp/options.c b/net/mptcp/options.c new file mode 100644 index 000000000000..b7a31c0e5283 --- /dev/null +++ b/net/mptcp/options.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#include +#include +#include +#include "protocol.h" + +void mptcp_parse_option(const unsigned char *ptr, int opsize, + struct tcp_options_received *opt_rx) +{ + struct mptcp_options_received *mp_opt = &opt_rx->mptcp; + u8 subtype = *ptr >> 4; + u8 version; + u8 flags; + + switch (subtype) { + case MPTCPOPT_MP_CAPABLE: + if (opsize != TCPOLEN_MPTCP_MPC_SYN && + opsize != TCPOLEN_MPTCP_MPC_ACK) + break; + + version = *ptr++ & MPTCP_VERSION_MASK; + if (version != MPTCP_SUPPORTED_VERSION) + break; + + flags = *ptr++; + if (!((flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA1) || + (flags & MPTCP_CAP_EXTENSIBILITY)) + break; + + /* RFC 6824, Section 3.1: + * "For the Checksum Required bit (labeled "A"), if either + * host requires the use of checksums, checksums MUST be used. + * In other words, the only way for checksums not to be used + * is if both hosts in their SYNs set A=0." + * + * Section 3.3.0: + * "If a checksum is not present when its use has been + * negotiated, the receiver MUST close the subflow with a RST as + * it is considered broken." + * + * We don't implement DSS checksum - fall back to TCP. + */ + if (flags & MPTCP_CAP_CHECKSUM_REQD) + break; + + mp_opt->mp_capable = 1; + mp_opt->sndr_key = get_unaligned_be64(ptr); + ptr += 8; + + if (opsize == TCPOLEN_MPTCP_MPC_ACK) { + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + pr_debug("MP_CAPABLE sndr=%llu, rcvr=%llu", + mp_opt->sndr_key, mp_opt->rcvr_key); + } else { + pr_debug("MP_CAPABLE sndr=%llu", mp_opt->sndr_key); + } + break; + + case MPTCPOPT_DSS: + pr_debug("DSS"); + mp_opt->dss = 1; + break; + + default: + break; + } +} + +void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +{ + if ((OPTION_MPTCP_MPC_SYN | + OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + u8 len; + + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYN; + else + len = TCPOLEN_MPTCP_MPC_ACK; + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | + (MPTCPOPT_MP_CAPABLE << 12) | + (MPTCP_SUPPORTED_VERSION << 8) | + MPTCP_CAP_HMAC_SHA1); + put_unaligned_be64(opts->sndr_key, ptr); + ptr += 2; + if (OPTION_MPTCP_MPC_ACK & opts->suboptions) { + put_unaligned_be64(opts->rcvr_key, ptr); + ptr += 2; + } + } +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ee04a01bffd3..c59cf8b220b0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -7,6 +7,35 @@ #ifndef __MPTCP_PROTOCOL_H #define __MPTCP_PROTOCOL_H +#define MPTCP_SUPPORTED_VERSION 0 + +/* MPTCP option bits */ +#define OPTION_MPTCP_MPC_SYN BIT(0) +#define OPTION_MPTCP_MPC_SYNACK BIT(1) +#define OPTION_MPTCP_MPC_ACK BIT(2) + +/* MPTCP option subtypes */ +#define MPTCPOPT_MP_CAPABLE 0 +#define MPTCPOPT_MP_JOIN 1 +#define MPTCPOPT_DSS 2 +#define MPTCPOPT_ADD_ADDR 3 +#define MPTCPOPT_RM_ADDR 4 +#define MPTCPOPT_MP_PRIO 5 +#define MPTCPOPT_MP_FAIL 6 +#define MPTCPOPT_MP_FASTCLOSE 7 + +/* MPTCP suboption lengths */ +#define TCPOLEN_MPTCP_MPC_SYN 12 +#define TCPOLEN_MPTCP_MPC_SYNACK 12 +#define TCPOLEN_MPTCP_MPC_ACK 20 + +/* MPTCP MP_CAPABLE flags */ +#define MPTCP_VERSION_MASK (0x0F) +#define MPTCP_CAP_CHECKSUM_REQD BIT(7) +#define MPTCP_CAP_EXTENSIBILITY BIT(6) +#define MPTCP_CAP_HMAC_SHA1 BIT(0) +#define MPTCP_CAP_FLAG_MASK (0x3F) + /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ -- cgit v1.2.3 From 2303f994b3e187091fd08148066688b08f837efc Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:17 -0800 Subject: mptcp: Associate MPTCP context with TCP socket Use ULP to associate a subflow_context structure with each TCP subflow socket. Creating these sockets requires new bind and connect functions to make sure ULP is set up immediately when the subflow sockets are created. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++ net/mptcp/Makefile | 2 +- net/mptcp/protocol.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++--- net/mptcp/protocol.h | 26 ++++++++++ net/mptcp/subflow.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 275 insertions(+), 7 deletions(-) create mode 100644 net/mptcp/subflow.c (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 52798ab00394..877947475814 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -397,6 +397,9 @@ struct tcp_sock { u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ +#if IS_ENABLED(CONFIG_MPTCP) + bool is_mptcp; +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 27a846263f08..e1ee5aade8b0 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o options.o +mptcp-y := protocol.o subflow.o options.o diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5e24e7cf7d70..294b03a0393a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -17,6 +17,53 @@ #include #include "protocol.h" +#define MPTCP_SAME_STATE TCP_MAX_STATES + +/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not + * completed yet or has failed, return the subflow socket. + * Otherwise return NULL. + */ +static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) +{ + if (!msk->subflow) + return NULL; + + return msk->subflow; +} + +static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) +{ + return ((struct sock *)msk)->sk_state == TCP_CLOSE; +} + +static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int err; + + ssock = __mptcp_nmpc_socket(msk); + if (ssock) + goto set_state; + + if (!__mptcp_can_create_subflow(msk)) + return ERR_PTR(-EINVAL); + + err = mptcp_subflow_create_socket(sk, &ssock); + if (err) + return ERR_PTR(err); + + msk->subflow = ssock; + subflow = mptcp_subflow_ctx(ssock->sk); + subflow->request_mptcp = 1; + +set_state: + if (state != MPTCP_SAME_STATE) + inet_sk_state_store(sk, state); + return ssock; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -48,12 +95,14 @@ static int mptcp_init_sock(struct sock *sk) static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; inet_sk_state_store(sk, TCP_CLOSE); - if (msk->subflow) { - pr_debug("subflow=%p", msk->subflow->sk); - sock_release(msk->subflow); + ssock = __mptcp_nmpc_socket(msk); + if (ssock) { + pr_debug("subflow=%p", mptcp_subflow_ctx(ssock->sk)); + sock_release(ssock); } sock_orphan(sk); @@ -67,7 +116,8 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) saddr->sa_family = AF_INET; - pr_debug("msk=%p, subflow=%p", msk, msk->subflow->sk); + pr_debug("msk=%p, subflow=%p", msk, + mptcp_subflow_ctx(msk->subflow->sk)); err = kernel_connect(msk->subflow, saddr, len, 0); @@ -93,15 +143,79 @@ static struct proto mptcp_prot = { .no_autobind = true, }; +static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err = -ENOTSUPP; + + if (uaddr->sa_family != AF_INET) // @@ allow only IPv4 for now + return err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->bind(ssock, uaddr, addr_len); + +unlock: + release_sock(sock->sk); + return err; +} + +static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_SYN_SENT); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->connect(ssock, uaddr, addr_len, flags); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + +unlock: + release_sock(sock->sk); + return err; +} + +static __poll_t mptcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + __poll_t mask = 0; + + return mask; +} + +static struct proto_ops mptcp_stream_ops; + static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, - .ops = &inet_stream_ops, + .ops = &mptcp_stream_ops, + .flags = INET_PROTOSW_ICSK, }; void __init mptcp_init(void) { + mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + mptcp_stream_ops = inet_stream_ops; + mptcp_stream_ops.bind = mptcp_bind; + mptcp_stream_ops.connect = mptcp_stream_connect; + mptcp_stream_ops.poll = mptcp_poll; + + mptcp_subflow_init(); + if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); @@ -109,13 +223,14 @@ void __init mptcp_init(void) } #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct proto_ops mptcp_v6_stream_ops; static struct proto mptcp_v6_prot; static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_v6_prot, - .ops = &inet6_stream_ops, + .ops = &mptcp_v6_stream_ops, .flags = INET_PROTOSW_ICSK, }; @@ -133,6 +248,11 @@ int mptcpv6_init(void) if (err) return err; + mptcp_v6_stream_ops = inet6_stream_ops; + mptcp_v6_stream_ops.bind = mptcp_bind; + mptcp_v6_stream_ops.connect = mptcp_stream_connect; + mptcp_v6_stream_ops.poll = mptcp_poll; + err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index c59cf8b220b0..543d4d5d8985 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -48,4 +48,30 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +/* MPTCP subflow context */ +struct mptcp_subflow_context { + u32 request_mptcp : 1; /* send MP_CAPABLE */ + struct sock *tcp_sock; /* tcp sk backpointer */ + struct sock *conn; /* parent mptcp_sock */ + struct rcu_head rcu; +}; + +static inline struct mptcp_subflow_context * +mptcp_subflow_ctx(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* Use RCU on icsk_ulp_data only for sock diag code */ + return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; +} + +static inline struct sock * +mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) +{ + return subflow->tcp_sock; +} + +void mptcp_subflow_init(void); +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c new file mode 100644 index 000000000000..bf8139353653 --- /dev/null +++ b/net/mptcp/subflow.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "protocol.h" + +int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) +{ + struct mptcp_subflow_context *subflow; + struct net *net = sock_net(sk); + struct socket *sf; + int err; + + err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf); + if (err) + return err; + + lock_sock(sf->sk); + + /* kernel sockets do not by default acquire net ref, but TCP timer + * needs it. + */ + sf->sk->sk_net_refcnt = 1; + get_net(net); + this_cpu_add(*net->core.sock_inuse, 1); + err = tcp_set_ulp(sf->sk, "mptcp"); + release_sock(sf->sk); + + if (err) + return err; + + subflow = mptcp_subflow_ctx(sf->sk); + pr_debug("subflow=%p", subflow); + + *new_sock = sf; + subflow->conn = sk; + + return 0; +} + +static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, + gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct mptcp_subflow_context *ctx; + + ctx = kzalloc(sizeof(*ctx), priority); + if (!ctx) + return NULL; + + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + + pr_debug("subflow=%p", ctx); + + ctx->tcp_sock = sk; + + return ctx; +} + +static int subflow_ulp_init(struct sock *sk) +{ + struct mptcp_subflow_context *ctx; + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + + /* disallow attaching ULP to a socket unless it has been + * created with sock_create_kern() + */ + if (!sk->sk_kern_sock) { + err = -EOPNOTSUPP; + goto out; + } + + ctx = subflow_create_ctx(sk, GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + + tp->is_mptcp = 1; +out: + return err; +} + +static void subflow_ulp_release(struct sock *sk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + + if (!ctx) + return; + + kfree_rcu(ctx, rcu); +} + +static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { + .name = "mptcp", + .owner = THIS_MODULE, + .init = subflow_ulp_init, + .release = subflow_ulp_release, +}; + +void mptcp_subflow_init(void) +{ + if (tcp_register_ulp(&subflow_ulp_ops) != 0) + panic("MPTCP: failed to register subflows to ULP\n"); +} -- cgit v1.2.3 From cec37a6e41aae7bf3df9a3da783380a4d9325fd8 Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:18 -0800 Subject: mptcp: Handle MP_CAPABLE options for outgoing connections Add hooks to tcp_output.c to add MP_CAPABLE to an outgoing SYN request, to capture the MP_CAPABLE in the received SYN-ACK, to add MP_CAPABLE to the final ACK of the three-way handshake. Use the .sk_rx_dst_set() handler in the subflow proto to capture when the responding SYN-ACK is received and notify the MPTCP connection layer. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 + include/net/mptcp.h | 57 +++++++++++ net/ipv4/tcp_input.c | 6 ++ net/ipv4/tcp_output.c | 44 +++++++++ net/ipv6/tcp_ipv6.c | 6 ++ net/mptcp/options.c | 100 +++++++++++++++++++ net/mptcp/protocol.c | 163 +++++++++++++++++++++++++----- net/mptcp/protocol.h | 40 +++++++- net/mptcp/subflow.c | 268 +++++++++++++++++++++++++++++++++++++++++++++++++- 9 files changed, 663 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 877947475814..e9ee06d887fa 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -137,6 +137,9 @@ struct tcp_request_sock { const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; +#if IS_ENABLED(CONFIG_MPTCP) + bool is_mptcp; +#endif u32 txhash; u32 rcv_isn; u32 snt_isn; diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3daec2ceb3ff..eabc57c3fde4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -39,8 +39,27 @@ struct mptcp_out_options { void mptcp_init(void); +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return tcp_sk(sk)->is_mptcp; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return tcp_rsk(req)->is_mptcp; +} + void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); +bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts); +void mptcp_rcv_synsent(struct sock *sk); +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts); +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts); + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); /* move the skb extension owership, with the assumption that 'to' is @@ -89,11 +108,47 @@ static inline void mptcp_init(void) { } +static inline bool sk_is_mptcp(const struct sock *sk) +{ + return false; +} + +static inline bool rsk_is_mptcp(const struct request_sock *req) +{ + return false; +} + static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } +static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline void mptcp_rcv_synsent(struct sock *sk) +{ +} + +static inline bool mptcp_synack_options(const struct request_sock *req, + unsigned int *size, + struct mptcp_out_options *opts) +{ + return false; +} + +static inline bool mptcp_established_options(struct sock *sk, + struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + return false; +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { @@ -107,6 +162,8 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, #endif /* CONFIG_MPTCP */ +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped); + #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); #elif IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3458ee13e6f0..5165c8de47ee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5978,6 +5978,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); + if (sk_is_mptcp(sk)) + mptcp_rcv_synsent(sk); + /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ @@ -6600,6 +6603,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; +#if IS_ENABLED(CONFIG_MPTCP) + tcp_rsk(req)->is_mptcp = 0; +#endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0f0984f39f67..5456076166da 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -597,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp, #endif } +static void mptcp_set_option_cond(const struct request_sock *req, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + if (rsk_is_mptcp(req)) { + unsigned int size; + + if (mptcp_synack_options(req, &size, &opts->mptcp)) { + if (*remaining >= size) { + opts->options |= OPTION_MPTCP; + *remaining -= size; + } + } + } +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -666,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, smc_set_option(tp, opts, &remaining); + if (sk_is_mptcp(sk)) { + unsigned int size; + + if (mptcp_syn_options(sk, &size, &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + remaining -= size; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -727,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, } } + mptcp_set_option_cond(req, opts, &remaining); + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -764,6 +791,23 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb size += TCPOLEN_TSTAMP_ALIGNED; } + /* MPTCP options have precedence over SACK for the limited TCP + * option space because a MPTCP connection would be forced to + * fall back to regular TCP if a required multipath option is + * missing. SACK still gets a chance to use whatever space is + * left. + */ + if (sk_is_mptcp(sk)) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + unsigned int opt_size = 0; + + if (mptcp_established_options(sk, skb, &opt_size, remaining, + &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + size += opt_size; + } + } + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 60068ffde1d9..33a578a3eb3a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -238,6 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; icsk->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(sk)) + mptcp_handle_ipv6_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_mapped_specific; @@ -248,6 +250,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { icsk->icsk_ext_hdr_len = exthdrlen; icsk->icsk_af_ops = &ipv6_specific; + if (sk_is_mptcp(sk)) + mptcp_handle_ipv6_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv6_specific; @@ -1203,6 +1207,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(newsk)) + mptcp_handle_ipv6_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #ifdef CONFIG_TCP_MD5SIG newtp->af_specific = &tcp_sock_ipv6_mapped_specific; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b7a31c0e5283..52ff2301b68b 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -72,14 +72,114 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, } } +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); + int length = (th->doff * 4) - sizeof(struct tcphdr); + + ptr = (const unsigned char *)(th + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + if (opcode == TCPOPT_MPTCP) + mptcp_parse_option(ptr, opsize, opt_rx); + ptr += opsize - 2; + length -= opsize; + } + } +} + +bool mptcp_syn_options(struct sock *sk, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (subflow->request_mptcp) { + pr_debug("local_key=%llu", subflow->local_key); + opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->sndr_key = subflow->local_key; + *size = TCPOLEN_MPTCP_MPC_SYN; + return true; + } + return false; +} + +void mptcp_rcv_synsent(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct tcp_sock *tp = tcp_sk(sk); + + pr_debug("subflow=%p", subflow); + if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + subflow->mp_capable = 1; + subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + } else { + tcp_sk(sk)->is_mptcp = 0; + } +} + +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (subflow->mp_capable && !subflow->fourth_ack) { + opts->suboptions = OPTION_MPTCP_MPC_ACK; + opts->sndr_key = subflow->local_key; + opts->rcvr_key = subflow->remote_key; + *size = TCPOLEN_MPTCP_MPC_ACK; + subflow->fourth_ack = 1; + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu", + subflow, subflow->local_key, subflow->remote_key); + return true; + } + return false; +} + +bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + if (subflow_req->mp_capable) { + opts->suboptions = OPTION_MPTCP_MPC_SYNACK; + opts->sndr_key = subflow_req->local_key; + *size = TCPOLEN_MPTCP_MPC_SYNACK; + pr_debug("subflow_req=%p, local_key=%llu", + subflow_req, subflow_req->local_key); + return true; + } + return false; +} + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | + OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { u8 len; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) len = TCPOLEN_MPTCP_MPC_SYN; + else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + len = TCPOLEN_MPTCP_MPC_SYNACK; else len = TCPOLEN_MPTCP_MPC_ACK; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 294b03a0393a..bdd58da1e4f6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -25,12 +25,28 @@ */ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) { - if (!msk->subflow) + if (!msk->subflow || mptcp_subflow_ctx(msk->subflow->sk)->fourth_ack) return NULL; return msk->subflow; } +/* if msk has a single subflow, and the mp_capable handshake is failed, + * return it. + * Otherwise returns NULL + */ +static struct socket *__mptcp_tcp_fallback(const struct mptcp_sock *msk) +{ + struct socket *ssock = __mptcp_nmpc_socket(msk); + + sock_owned_by_me((const struct sock *)msk); + + if (!ssock || sk_is_mptcp(ssock->sk)) + return NULL; + + return ssock; +} + static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) { return ((struct sock *)msk)->sk_state == TCP_CLOSE; @@ -56,6 +72,7 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); + list_add(&subflow->node, &msk->conn_list); subflow->request_mptcp = 1; set_state: @@ -64,66 +81,169 @@ set_state: return ssock; } +static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; + struct socket *ssock; + struct sock *ssk; + int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - return sock_sendmsg(subflow, msg); + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (ssock) { + pr_debug("fallback passthrough"); + ret = sock_sendmsg(ssock, msg); + release_sock(sk); + return ret; + } + + ssk = mptcp_subflow_get(msk); + if (!ssk) { + release_sock(sk); + return -ENOTCONN; + } + + ret = sock_sendmsg(ssk->sk_socket, msg); + + release_sock(sk); + return ret; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *subflow = msk->subflow; + struct socket *ssock; + struct sock *ssk; + int copied = 0; if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) return -EOPNOTSUPP; - return sock_recvmsg(subflow, msg, flags); + lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (ssock) { + pr_debug("fallback-read subflow=%p", + mptcp_subflow_ctx(ssock->sk)); + copied = sock_recvmsg(ssock, msg, flags); + release_sock(sk); + return copied; + } + + ssk = mptcp_subflow_get(msk); + if (!ssk) { + release_sock(sk); + return -ENOTCONN; + } + + copied = sock_recvmsg(ssk->sk_socket, msg, flags); + + release_sock(sk); + + return copied; +} + +/* subflow sockets can be either outgoing (connect) or incoming + * (accept). + * + * Outgoing subflows use in-kernel sockets. + * Incoming subflows do not have their own 'struct socket' allocated, + * so we need to use tcp_close() after detaching them from the mptcp + * parent socket. + */ +static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow, + long timeout) +{ + struct socket *sock = READ_ONCE(ssk->sk_socket); + + list_del(&subflow->node); + + if (sock && sock != sk->sk_socket) { + /* outgoing subflow */ + sock_release(sock); + } else { + /* incoming subflow */ + tcp_close(ssk, timeout); + } } static int mptcp_init_sock(struct sock *sk) { + struct mptcp_sock *msk = mptcp_sk(sk); + + INIT_LIST_HEAD(&msk->conn_list); + return 0; } static void mptcp_close(struct sock *sk, long timeout) { + struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; inet_sk_state_store(sk, TCP_CLOSE); - ssock = __mptcp_nmpc_socket(msk); - if (ssock) { - pr_debug("subflow=%p", mptcp_subflow_ctx(ssock->sk)); - sock_release(ssock); + lock_sock(sk); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + __mptcp_close_ssk(sk, ssk, subflow, timeout); } - sock_orphan(sk); - sock_put(sk); + release_sock(sk); + sk_common_release(sk); } -static int mptcp_connect(struct sock *sk, struct sockaddr *saddr, int len) +static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - int err; + struct socket *ssock; - saddr->sa_family = AF_INET; + ssock = __mptcp_nmpc_socket(msk); + pr_debug("msk=%p, subflow=%p", msk, ssock); + if (WARN_ON_ONCE(!ssock)) + return -EINVAL; - pr_debug("msk=%p, subflow=%p", msk, - mptcp_subflow_ctx(msk->subflow->sk)); + return inet_csk_get_port(ssock->sk, snum); +} - err = kernel_connect(msk->subflow, saddr, len, 0); +void mptcp_finish_connect(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + struct sock *sk; - sk->sk_state = TCP_ESTABLISHED; + subflow = mptcp_subflow_ctx(ssk); - return err; + if (!subflow->mp_capable) + return; + + sk = subflow->conn; + msk = mptcp_sk(sk); + + /* the socket is not connected yet, no msk/subflow ops can access/race + * accessing the field below + */ + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->local_key, subflow->local_key); } static struct proto mptcp_prot = { @@ -132,13 +252,12 @@ static struct proto mptcp_prot = { .init = mptcp_init_sock, .close = mptcp_close, .accept = inet_csk_accept, - .connect = mptcp_connect, .shutdown = tcp_shutdown, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .hash = inet_hash, .unhash = inet_unhash, - .get_port = inet_csk_get_port, + .get_port = mptcp_get_port, .obj_size = sizeof(struct mptcp_sock), .no_autobind = true, }; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 543d4d5d8985..bd66e7415515 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -40,19 +40,47 @@ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; + u64 local_key; + u64 remote_key; + struct list_head conn_list; struct socket *subflow; /* outgoing connect/listener/!mp_capable */ }; +#define mptcp_for_each_subflow(__msk, __subflow) \ + list_for_each_entry(__subflow, &((__msk)->conn_list), node) + static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) { return (struct mptcp_sock *)sk; } +struct mptcp_subflow_request_sock { + struct tcp_request_sock sk; + u8 mp_capable : 1, + mp_join : 1, + backup : 1; + u64 local_key; + u64 remote_key; +}; + +static inline struct mptcp_subflow_request_sock * +mptcp_subflow_rsk(const struct request_sock *rsk) +{ + return (struct mptcp_subflow_request_sock *)rsk; +} + /* MPTCP subflow context */ struct mptcp_subflow_context { - u32 request_mptcp : 1; /* send MP_CAPABLE */ + struct list_head node;/* conn_list of subflows */ + u64 local_key; + u64 remote_key; + u32 request_mptcp : 1, /* send MP_CAPABLE */ + mp_capable : 1, /* remote is MPTCP capable */ + fourth_ack : 1, /* send initial DSS */ + conn_finished : 1; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ + const struct inet_connection_sock_af_ops *icsk_af_ops; struct rcu_head rcu; }; @@ -74,4 +102,14 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) void mptcp_subflow_init(void); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +extern const struct inet_connection_sock_af_ops ipv4_specific; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +extern const struct inet_connection_sock_af_ops ipv6_specific; +#endif + +void mptcp_get_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx); + +void mptcp_finish_connect(struct sock *sk); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bf8139353653..df3192305967 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -12,9 +12,188 @@ #include #include #include +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include +#endif #include #include "protocol.h" +static void subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct tcp_options_received rx_opt; + + pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + + memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); + mptcp_get_options(skb, &rx_opt); + + subflow_req->mp_capable = 0; + +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + return; +#endif + + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { + subflow_req->mp_capable = 1; + subflow_req->remote_key = rx_opt.mptcp.sndr_key; + } +} + +static void subflow_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static void subflow_v6_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + tcp_rsk(req)->is_mptcp = 1; + + tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + + subflow_init_req(req, sk_listener, skb); +} +#endif + +static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); + + if (subflow->conn && !subflow->conn_finished) { + pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), + subflow->remote_key); + mptcp_finish_connect(sk); + subflow->conn_finished = 1; + } +} + +static struct request_sock_ops subflow_request_sock_ops; +static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; + +static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + /* Never answer to SYNs sent to broadcast or multicast */ + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv4_ops, + sk, skb); +drop: + tcp_listendrop(sk); + return 0; +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; +static struct inet_connection_sock_af_ops subflow_v6_specific; +static struct inet_connection_sock_af_ops subflow_v6m_specific; + +static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + pr_debug("subflow=%p", subflow); + + if (skb->protocol == htons(ETH_P_IP)) + return subflow_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + return tcp_conn_request(&subflow_request_sock_ops, + &subflow_request_sock_ipv6_ops, sk, skb); + +drop: + tcp_listendrop(sk); + return 0; /* don't send reset */ +} +#endif + +static struct sock *subflow_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); + struct sock *child; + + pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + + /* if the sk is MP_CAPABLE, we already received the client key */ + + child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + req_unhash, own_req); + + if (child && *own_req) { + if (!mptcp_subflow_ctx(child)) { + pr_debug("Closing child socket"); + inet_sk_set_state(child, TCP_CLOSE); + sock_set_flag(child, SOCK_DEAD); + inet_csk_destroy_sock(child); + child = NULL; + } + } + + return child; +} + +static struct inet_connection_sock_af_ops subflow_specific; + +static struct inet_connection_sock_af_ops * +subflow_default_af_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (sk->sk_family == AF_INET6) + return &subflow_v6_specific; +#endif + return &subflow_specific; +} + +void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock_af_ops *target; + + target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); + + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); + + if (likely(icsk->icsk_af_ops == target)) + return; + + subflow->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = target; +#endif +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -22,7 +201,8 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) struct socket *sf; int err; - err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf); + err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP, + &sf); if (err) return err; @@ -60,6 +240,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, return NULL; rcu_assign_pointer(icsk->icsk_ulp_data, ctx); + INIT_LIST_HEAD(&ctx->node); pr_debug("subflow=%p", ctx); @@ -70,6 +251,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, static int subflow_ulp_init(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; struct tcp_sock *tp = tcp_sk(sk); int err = 0; @@ -91,6 +273,8 @@ static int subflow_ulp_init(struct sock *sk) pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); tp->is_mptcp = 1; + ctx->icsk_af_ops = icsk->icsk_af_ops; + icsk->icsk_af_ops = subflow_default_af_ops(sk); out: return err; } @@ -105,15 +289,97 @@ static void subflow_ulp_release(struct sock *sk) kfree_rcu(ctx, rcu); } +static void subflow_ulp_fallback(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_ulp_ops = NULL; + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); + tcp_sk(sk)->is_mptcp = 0; +} + +static void subflow_ulp_clone(const struct request_sock *req, + struct sock *newsk, + const gfp_t priority) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); + struct mptcp_subflow_context *new_ctx; + + if (!subflow_req->mp_capable) { + subflow_ulp_fallback(newsk); + return; + } + + new_ctx = subflow_create_ctx(newsk, priority); + if (new_ctx == NULL) { + subflow_ulp_fallback(newsk); + return; + } + + new_ctx->conn_finished = 1; + new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; + new_ctx->mp_capable = 1; + new_ctx->fourth_ack = 1; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; +} + static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, .init = subflow_ulp_init, .release = subflow_ulp_release, + .clone = subflow_ulp_clone, }; +static int subflow_ops_init(struct request_sock_ops *subflow_ops) +{ + subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); + subflow_ops->slab_name = "request_sock_subflow"; + + subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, + subflow_ops->obj_size, 0, + SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU, + NULL); + if (!subflow_ops->slab) + return -ENOMEM; + + return 0; +} + void mptcp_subflow_init(void) { + subflow_request_sock_ops = tcp_request_sock_ops; + if (subflow_ops_init(&subflow_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow request sock ops\n"); + + subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; + subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + + subflow_specific = ipv4_specific; + subflow_specific.conn_request = subflow_v4_conn_request; + subflow_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_specific.sk_rx_dst_set = subflow_finish_connect; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; + subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + + subflow_v6_specific = ipv6_specific; + subflow_v6_specific.conn_request = subflow_v6_conn_request; + subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; + subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + + subflow_v6m_specific = subflow_v6_specific; + subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; + subflow_v6m_specific.send_check = ipv4_specific.send_check; + subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; + subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; + subflow_v6m_specific.net_frag_header_len = 0; +#endif + if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); } -- cgit v1.2.3 From cf7da0d66cc1a2a19fc5930bb746ffbb2d4cd1be Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:19 -0800 Subject: mptcp: Create SUBFLOW socket for incoming connections Add subflow_request_sock type that extends tcp_request_sock and add an is_mptcp flag to tcp_request_sock distinguish them. Override the listen() and accept() methods of the MPTCP socket proto_ops so they may act on the subflow socket. Override the conn_request() and syn_recv_sock() handlers in the inet_connection_sock to handle incoming MPTCP SYNs and the ACK to the response SYN. Add handling in tcp_output.c to add MP_CAPABLE to an outgoing SYN-ACK response for a subflow_request_sock. Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 231 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bdd58da1e4f6..e08a25eabcd5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -14,6 +14,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +#include +#endif #include #include "protocol.h" @@ -212,6 +215,90 @@ static void mptcp_close(struct sock *sk, long timeout) sk_common_release(sk); } +static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); + struct ipv6_pinfo *msk6 = inet6_sk(msk); + + msk->sk_v6_daddr = ssk->sk_v6_daddr; + msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; + + if (msk6 && ssk6) { + msk6->saddr = ssk6->saddr; + msk6->flow_label = ssk6->flow_label; + } +#endif + + inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; + inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; + inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; + inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; + inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; + inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; +} + +static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, + bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *listener; + struct sock *newsk; + + listener = __mptcp_nmpc_socket(msk); + if (WARN_ON_ONCE(!listener)) { + *err = -EINVAL; + return NULL; + } + + pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); + newsk = inet_csk_accept(listener->sk, flags, err, kern); + if (!newsk) + return NULL; + + pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); + + if (sk_is_mptcp(newsk)) { + struct mptcp_subflow_context *subflow; + struct sock *new_mptcp_sock; + struct sock *ssk = newsk; + + subflow = mptcp_subflow_ctx(newsk); + lock_sock(sk); + + local_bh_disable(); + new_mptcp_sock = sk_clone_lock(sk, GFP_ATOMIC); + if (!new_mptcp_sock) { + *err = -ENOBUFS; + local_bh_enable(); + release_sock(sk); + tcp_close(newsk, 0); + return NULL; + } + + mptcp_init_sock(new_mptcp_sock); + + msk = mptcp_sk(new_mptcp_sock); + msk->remote_key = subflow->remote_key; + msk->local_key = subflow->local_key; + msk->subflow = NULL; + + newsk = new_mptcp_sock; + mptcp_copy_inaddrs(newsk, ssk); + list_add(&subflow->node, &msk->conn_list); + + /* will be fully established at mptcp_stream_accept() + * completion. + */ + inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV); + bh_unlock_sock(new_mptcp_sock); + local_bh_enable(); + release_sock(sk); + } + + return newsk; +} + static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -246,12 +333,21 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->local_key, subflow->local_key); } +static void mptcp_sock_graft(struct sock *sk, struct socket *parent) +{ + write_lock_bh(&sk->sk_callback_lock); + rcu_assign_pointer(sk->sk_wq, &parent->wq); + sk_set_socket(sk, parent); + sk->sk_uid = SOCK_INODE(parent)->i_uid; + write_unlock_bh(&sk->sk_callback_lock); +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, .close = mptcp_close, - .accept = inet_csk_accept, + .accept = mptcp_accept, .shutdown = tcp_shutdown, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, @@ -266,10 +362,7 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct socket *ssock; - int err = -ENOTSUPP; - - if (uaddr->sa_family != AF_INET) // @@ allow only IPv4 for now - return err; + int err; lock_sock(sock->sk); ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); @@ -279,6 +372,8 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } err = ssock->ops->bind(ssock, uaddr, addr_len); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); unlock: release_sock(sock->sk); @@ -299,14 +394,139 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto unlock; } +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0; +#endif + err = ssock->ops->connect(ssock, uaddr, addr_len, flags); inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + mptcp_copy_inaddrs(sock->sk, ssock->sk); unlock: release_sock(sock->sk); return err; } +static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcp_prot) { + /* we are being invoked from __sys_accept4, after + * mptcp_accept() has just accepted a non-mp-capable + * flow: sk is a tcp_sk, not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + sock->ops = &inet_stream_ops; + } + + return inet_getname(sock, uaddr, peer); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + if (sock->sk->sk_prot == &tcpv6_prot) { + /* we are being invoked from __sys_accept4 after + * mptcp_accept() has accepted a non-mp-capable + * subflow: sk is a tcp_sk, not mptcp. + * + * Hand the socket over to tcp so all further + * socket ops bypass mptcp. + */ + sock->ops = &inet6_stream_ops; + } + + return inet6_getname(sock, uaddr, peer); +} +#endif + +static int mptcp_listen(struct socket *sock, int backlog) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + ssock = __mptcp_socket_create(msk, TCP_LISTEN); + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); + goto unlock; + } + + err = ssock->ops->listen(ssock, backlog); + inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); + if (!err) + mptcp_copy_inaddrs(sock->sk, ssock->sk); + +unlock: + release_sock(sock->sk); + return err; +} + +static bool is_tcp_proto(const struct proto *p) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + return p == &tcp_prot || p == &tcpv6_prot; +#else + return p == &tcp_prot; +#endif +} + +static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, + int flags, bool kern) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct socket *ssock; + int err; + + pr_debug("msk=%p", msk); + + lock_sock(sock->sk); + if (sock->sk->sk_state != TCP_LISTEN) + goto unlock_fail; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + goto unlock_fail; + + sock_hold(ssock->sk); + release_sock(sock->sk); + + err = ssock->ops->accept(sock, newsock, flags, kern); + if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) { + struct mptcp_sock *msk = mptcp_sk(newsock->sk); + struct mptcp_subflow_context *subflow; + + /* set ssk->sk_socket of accept()ed flows to mptcp socket. + * This is needed so NOSPACE flag can be set from tcp stack. + */ + list_for_each_entry(subflow, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!ssk->sk_socket) + mptcp_sock_graft(ssk, newsock); + } + + inet_sk_state_store(newsock->sk, TCP_ESTABLISHED); + } + + sock_put(ssock->sk); + return err; + +unlock_fail: + release_sock(sock->sk); + return -EINVAL; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -332,6 +552,9 @@ void __init mptcp_init(void) mptcp_stream_ops.bind = mptcp_bind; mptcp_stream_ops.connect = mptcp_stream_connect; mptcp_stream_ops.poll = mptcp_poll; + mptcp_stream_ops.accept = mptcp_stream_accept; + mptcp_stream_ops.getname = mptcp_v4_getname; + mptcp_stream_ops.listen = mptcp_listen; mptcp_subflow_init(); @@ -371,6 +594,9 @@ int mptcpv6_init(void) mptcp_v6_stream_ops.bind = mptcp_bind; mptcp_v6_stream_ops.connect = mptcp_stream_connect; mptcp_v6_stream_ops.poll = mptcp_poll; + mptcp_v6_stream_ops.accept = mptcp_stream_accept; + mptcp_v6_stream_ops.getname = mptcp_v6_getname; + mptcp_v6_stream_ops.listen = mptcp_listen; err = inet6_register_protosw(&mptcp_v6_protosw); if (err) -- cgit v1.2.3 From 79c0949e9a09f6a14a6dd18dc8396029423f9b68 Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:20 -0800 Subject: mptcp: Add key generation and token tree Generate the local keys, IDSN, and token when creating a new socket. Introduce the token tree to track all tokens in use using a radix tree with the MPTCP token itself as the index. Override the rebuild_header callback in inet_connection_sock_af_ops for creating the local key on a new outgoing connection. Override the init_req callback of tcp_request_sock_ops for creating the local key on a new incoming connection. Will be used to obtain the MPTCP parent socket to handle incoming joins. Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/Makefile | 2 +- net/mptcp/crypto.c | 122 ++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 16 +++++ net/mptcp/protocol.h | 32 +++++++++ net/mptcp/subflow.c | 69 ++++++++++++++++-- net/mptcp/token.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 428 insertions(+), 8 deletions(-) create mode 100644 net/mptcp/crypto.c create mode 100644 net/mptcp/token.c (limited to 'net') diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index e1ee5aade8b0..178ae81d8b66 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c new file mode 100644 index 000000000000..bbd6d01af211 --- /dev/null +++ b/net/mptcp/crypto.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP cryptographic functions + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and + * mptcp_ipv6 from multipath-tcp.org, authored by: + * + * SĂ©bastien BarrĂ© + * Christoph Paasch + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien DuchĂªne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + */ + +#include +#include +#include + +#include "protocol.h" + +struct sha1_state { + u32 workspace[SHA_WORKSPACE_WORDS]; + u32 digest[SHA_DIGEST_WORDS]; + unsigned int count; +}; + +static void sha1_init(struct sha1_state *state) +{ + sha_init(state->digest); + state->count = 0; +} + +static void sha1_update(struct sha1_state *state, u8 *input) +{ + sha_transform(state->digest, input, state->workspace); + state->count += SHA_MESSAGE_BYTES; +} + +static void sha1_pad_final(struct sha1_state *state, u8 *input, + unsigned int length, __be32 *mptcp_hashed_key) +{ + int i; + + input[length] = 0x80; + memset(&input[length + 1], 0, SHA_MESSAGE_BYTES - length - 9); + put_unaligned_be64((length + state->count) << 3, + &input[SHA_MESSAGE_BYTES - 8]); + + sha_transform(state->digest, input, state->workspace); + for (i = 0; i < SHA_DIGEST_WORDS; ++i) + put_unaligned_be32(state->digest[i], &mptcp_hashed_key[i]); + + memzero_explicit(state->workspace, SHA_WORKSPACE_WORDS << 2); +} + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) +{ + __be32 mptcp_hashed_key[SHA_DIGEST_WORDS]; + u8 input[SHA_MESSAGE_BYTES]; + struct sha1_state state; + + sha1_init(&state); + put_unaligned_be64(key, input); + sha1_pad_final(&state, input, 8, mptcp_hashed_key); + + if (token) + *token = be32_to_cpu(mptcp_hashed_key[0]); + if (idsn) + *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[3])); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + u32 *hash_out) +{ + u8 input[SHA_MESSAGE_BYTES * 2]; + struct sha1_state state; + u8 key1be[8]; + u8 key2be[8]; + int i; + + put_unaligned_be64(key1, key1be); + put_unaligned_be64(key2, key2be); + + /* Generate key xored with ipad */ + memset(input, 0x36, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + put_unaligned_be32(nonce1, &input[SHA_MESSAGE_BYTES]); + put_unaligned_be32(nonce2, &input[SHA_MESSAGE_BYTES + 4]); + + sha1_init(&state); + sha1_update(&state, input); + + /* emit sha256(K1 || msg) on the second input block, so we can + * reuse 'input' for the last hashing + */ + sha1_pad_final(&state, &input[SHA_MESSAGE_BYTES], 8, + (__force __be32 *)&input[SHA_MESSAGE_BYTES]); + + /* Prepare second part of hmac */ + memset(input, 0x5C, SHA_MESSAGE_BYTES); + for (i = 0; i < 8; i++) + input[i] ^= key1be[i]; + for (i = 0; i < 8; i++) + input[i + 8] ^= key2be[i]; + + sha1_init(&state); + sha1_update(&state, input); + sha1_pad_final(&state, &input[SHA_MESSAGE_BYTES], SHA_DIGEST_WORDS << 2, + (__be32 *)hash_out); +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e08a25eabcd5..3f66b6a3bb28 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -201,6 +201,7 @@ static void mptcp_close(struct sock *sk, long timeout) struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); + mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); lock_sock(sk); @@ -281,8 +282,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, msk = mptcp_sk(new_mptcp_sock); msk->remote_key = subflow->remote_key; msk->local_key = subflow->local_key; + msk->token = subflow->token; msk->subflow = NULL; + mptcp_token_update_accept(newsk, new_mptcp_sock); newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); @@ -299,6 +302,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, return newsk; } +static void mptcp_destroy(struct sock *sk) +{ +} + static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -331,6 +338,7 @@ void mptcp_finish_connect(struct sock *ssk) */ WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); + WRITE_ONCE(msk->token, subflow->token); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) @@ -349,6 +357,7 @@ static struct proto mptcp_prot = { .close = mptcp_close, .accept = mptcp_accept, .shutdown = tcp_shutdown, + .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, .hash = inet_hash, @@ -568,6 +577,12 @@ void __init mptcp_init(void) static struct proto_ops mptcp_v6_stream_ops; static struct proto mptcp_v6_prot; +static void mptcp_v6_destroy(struct sock *sk) +{ + mptcp_destroy(sk); + inet6_destroy_sock(sk); +} + static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, @@ -583,6 +598,7 @@ int mptcpv6_init(void) mptcp_v6_prot = mptcp_prot; strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; + mptcp_v6_prot.destroy = mptcp_v6_destroy; mptcp_v6_prot.obj_size = sizeof(struct mptcp_sock) + sizeof(struct ipv6_pinfo); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index bd66e7415515..5f43fa0275c0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -7,6 +7,10 @@ #ifndef __MPTCP_PROTOCOL_H #define __MPTCP_PROTOCOL_H +#include +#include +#include + #define MPTCP_SUPPORTED_VERSION 0 /* MPTCP option bits */ @@ -42,6 +46,7 @@ struct mptcp_sock { struct inet_connection_sock sk; u64 local_key; u64 remote_key; + u32 token; struct list_head conn_list; struct socket *subflow; /* outgoing connect/listener/!mp_capable */ }; @@ -61,6 +66,8 @@ struct mptcp_subflow_request_sock { backup : 1; u64 local_key; u64 remote_key; + u64 idsn; + u32 token; }; static inline struct mptcp_subflow_request_sock * @@ -74,6 +81,8 @@ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ u64 local_key; u64 remote_key; + u64 idsn; + u32 token; u32 request_mptcp : 1, /* send MP_CAPABLE */ mp_capable : 1, /* remote is MPTCP capable */ fourth_ack : 1, /* send initial DSS */ @@ -112,4 +121,27 @@ void mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); +int mptcp_token_new_request(struct request_sock *req); +void mptcp_token_destroy_request(u32 token); +int mptcp_token_new_connect(struct sock *sk); +int mptcp_token_new_accept(u32 token); +void mptcp_token_update_accept(struct sock *sk, struct sock *conn); +void mptcp_token_destroy(u32 token); + +void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); +static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) +{ + /* we might consider a faster version that computes the key as a + * hash of some information available in the MPTCP socket. Use + * random data at the moment, as it's probably the safest option + * in case multiple sockets are opened in different namespaces at + * the same time. + */ + get_random_bytes(key, sizeof(u64)); + mptcp_crypto_key_sha(*key, token, idsn); +} + +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + u32 *hash_out); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index df3192305967..89b91bc7a831 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 - 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include @@ -18,6 +20,33 @@ #include #include "protocol.h" +static int subflow_rebuild_header(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + int err = 0; + + if (subflow->request_mptcp && !subflow->token) { + pr_debug("subflow=%p", sk); + err = mptcp_token_new_connect(sk); + } + + if (err) + return err; + + return subflow->icsk_af_ops->rebuild_header(sk); +} + +static void subflow_req_destructor(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + + pr_debug("subflow_req=%p", subflow_req); + + if (subflow_req->mp_capable) + mptcp_token_destroy_request(subflow_req->token); + tcp_request_sock_ops.destructor(req); +} + static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -42,7 +71,12 @@ static void subflow_init_req(struct request_sock *req, #endif if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { - subflow_req->mp_capable = 1; + int err; + + err = mptcp_token_new_request(req); + if (err == 0) + subflow_req->mp_capable = 1; + subflow_req->remote_key = rx_opt.mptcp.sndr_key; } } @@ -150,16 +184,28 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, req_unhash, own_req); if (child && *own_req) { - if (!mptcp_subflow_ctx(child)) { - pr_debug("Closing child socket"); - inet_sk_set_state(child, TCP_CLOSE); - sock_set_flag(child, SOCK_DEAD); - inet_csk_destroy_sock(child); - child = NULL; + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + + /* we have null ctx on TCP fallback, not fatal on MPC + * handshake + */ + if (!ctx) + return child; + + if (ctx->mp_capable) { + if (mptcp_token_new_accept(ctx->token)) + goto close_child; } } return child; + +close_child: + pr_debug("closing child socket"); + tcp_send_active_reset(child, GFP_ATOMIC); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return NULL; } static struct inet_connection_sock_af_ops subflow_specific; @@ -224,6 +270,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) pr_debug("subflow=%p", subflow); *new_sock = sf; + sock_hold(sk); subflow->conn = sk; return 0; @@ -286,6 +333,9 @@ static void subflow_ulp_release(struct sock *sk) if (!ctx) return; + if (ctx->conn) + sock_put(ctx->conn); + kfree_rcu(ctx, rcu); } @@ -323,6 +373,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->fourth_ack = 1; new_ctx->remote_key = subflow_req->remote_key; new_ctx->local_key = subflow_req->local_key; + new_ctx->token = subflow_req->token; } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { @@ -346,6 +397,8 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) if (!subflow_ops->slab) return -ENOMEM; + subflow_ops->destructor = subflow_req_destructor; + return 0; } @@ -362,6 +415,7 @@ void mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_specific.rebuild_header = subflow_rebuild_header; #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; @@ -371,6 +425,7 @@ void mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_v6_specific.rebuild_header = subflow_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; diff --git a/net/mptcp/token.c b/net/mptcp/token.c new file mode 100644 index 000000000000..84d887806090 --- /dev/null +++ b/net/mptcp/token.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP token management + * Copyright (c) 2017 - 2019, Intel Corporation. + * + * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org, + * authored by: + * + * SĂ©bastien BarrĂ© + * Christoph Paasch + * Jaakko Korkeaniemi + * Gregory Detal + * Fabien DuchĂªne + * Andreas Seelinger + * Lavkesh Lahngir + * Andreas Ripke + * Vlad Dogaru + * Octavian Purdila + * John Ronan + * Catalin Nicutar + * Brandon Heller + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "protocol.h" + +static RADIX_TREE(token_tree, GFP_ATOMIC); +static RADIX_TREE(token_req_tree, GFP_ATOMIC); +static DEFINE_SPINLOCK(token_tree_lock); +static int token_used __read_mostly; + +/** + * mptcp_token_new_request - create new key/idsn/token for subflow_request + * @req - the request socket + * + * This function is called when a new mptcp connection is coming in. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * Returns 0 on success. + */ +int mptcp_token_new_request(struct request_sock *req) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow_req->local_key, + &subflow_req->token, + &subflow_req->idsn); + pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", + req, subflow_req->local_key, subflow_req->token, + subflow_req->idsn); + + token = subflow_req->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + + err = radix_tree_insert(&token_req_tree, + subflow_req->token, &token_used); + spin_unlock_bh(&token_tree_lock); + return err; +} + +/** + * mptcp_token_new_connect - create new key/idsn/token for subflow + * @sk - the socket that will initiate a connection + * + * This function is called when a new outgoing mptcp connection is + * initiated. + * + * It creates a unique token to identify the new mptcp connection, + * a secret local key and the initial data sequence number (idsn). + * + * On success, the mptcp connection can be found again using + * the computed token at a later time, this is needed to process + * join requests. + * + * returns 0 on success. + */ +int mptcp_token_new_connect(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *mptcp_sock = subflow->conn; + int err; + + while (1) { + u32 token; + + mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, + &subflow->idsn); + + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + + token = subflow->token; + spin_lock_bh(&token_tree_lock); + if (!radix_tree_lookup(&token_req_tree, token) && + !radix_tree_lookup(&token_tree, token)) + break; + spin_unlock_bh(&token_tree_lock); + } + err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_new_accept - insert token for later processing + * @token: the token to insert to the tree + * + * Called when a SYN packet creates a new logical connection, i.e. + * is not a join request. + * + * We don't have an mptcp socket yet at that point. + * This is paired with mptcp_token_update_accept, called on accept(). + */ +int mptcp_token_new_accept(u32 token) +{ + int err; + + spin_lock_bh(&token_tree_lock); + err = radix_tree_insert(&token_tree, token, &token_used); + spin_unlock_bh(&token_tree_lock); + + return err; +} + +/** + * mptcp_token_update_accept - update token to map to mptcp socket + * @conn: the new struct mptcp_sock + * @sk: the initial subflow for this mptcp socket + * + * Called when the first mptcp socket is created on accept to + * refresh the dummy mapping (done to reserve the token) with + * the mptcp_socket structure that wasn't allocated before. + */ +void mptcp_token_update_accept(struct sock *sk, struct sock *conn) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + void __rcu **slot; + + spin_lock_bh(&token_tree_lock); + slot = radix_tree_lookup_slot(&token_tree, subflow->token); + WARN_ON_ONCE(!slot); + if (slot) { + WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used); + radix_tree_replace_slot(&token_tree, slot, conn); + } + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy_request - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove not-yet-fully-established incoming connection identified + * by @token. + */ +void mptcp_token_destroy_request(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_req_tree, token); + spin_unlock_bh(&token_tree_lock); +} + +/** + * mptcp_token_destroy - remove mptcp connection/token + * @token - token of mptcp connection to remove + * + * Remove the connection identified by @token. + */ +void mptcp_token_destroy(u32 token) +{ + spin_lock_bh(&token_tree_lock); + radix_tree_delete(&token_tree, token); + spin_unlock_bh(&token_tree_lock); +} -- cgit v1.2.3 From 214984901aaf50cc52eebeabf14017edeb3a845f Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:21 -0800 Subject: mptcp: Add shutdown() socket operation Call shutdown on all subflows in use on the given socket, or on the fallback socket. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3f66b6a3bb28..249b2506a66a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -196,6 +196,29 @@ static int mptcp_init_sock(struct sock *sk) return 0; } +static void mptcp_subflow_shutdown(struct sock *ssk, int how) +{ + lock_sock(ssk); + + switch (ssk->sk_state) { + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* fall through */ + case TCP_SYN_SENT: + tcp_disconnect(ssk, O_NONBLOCK); + break; + default: + ssk->sk_shutdown |= how; + tcp_shutdown(ssk, how); + break; + } + + /* Wake up anyone sleeping in poll. */ + ssk->sk_state_change(ssk); + release_sock(ssk); +} + static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; @@ -273,6 +296,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, *err = -ENOBUFS; local_bh_enable(); release_sock(sk); + mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1); tcp_close(newsk, 0); return NULL; } @@ -544,6 +568,46 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } +static int mptcp_shutdown(struct socket *sock, int how) +{ + struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct mptcp_subflow_context *subflow; + int ret = 0; + + pr_debug("sk=%p, how=%d", msk, how); + + lock_sock(sock->sk); + + if (how == SHUT_WR || how == SHUT_RDWR) + inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); + + how++; + + if ((how & ~SHUTDOWN_MASK) || !how) { + ret = -EINVAL; + goto out_unlock; + } + + if (sock->state == SS_CONNECTING) { + if ((1 << sock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + mptcp_subflow_shutdown(tcp_sk, how); + } + +out_unlock: + release_sock(sock->sk); + + return ret; +} + static struct proto_ops mptcp_stream_ops; static struct inet_protosw mptcp_protosw = { @@ -564,6 +628,7 @@ void __init mptcp_init(void) mptcp_stream_ops.accept = mptcp_stream_accept; mptcp_stream_ops.getname = mptcp_v4_getname; mptcp_stream_ops.listen = mptcp_listen; + mptcp_stream_ops.shutdown = mptcp_shutdown; mptcp_subflow_init(); @@ -613,6 +678,7 @@ int mptcpv6_init(void) mptcp_v6_stream_ops.accept = mptcp_stream_accept; mptcp_v6_stream_ops.getname = mptcp_v6_getname; mptcp_v6_stream_ops.listen = mptcp_listen; + mptcp_v6_stream_ops.shutdown = mptcp_shutdown; err = inet6_register_protosw(&mptcp_v6_protosw); if (err) -- cgit v1.2.3 From 717e79c867ca5f27815578815feafa3f3944f06b Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Tue, 21 Jan 2020 16:56:22 -0800 Subject: mptcp: Add setsockopt()/getsockopt() socket operations set/getsockopt behaviour with multiple subflows is undefined. Therefore, for now, we return -EOPNOTSUPP unless we're in fallback mode. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 249b2506a66a..875ca48de4b2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -330,6 +330,62 @@ static void mptcp_destroy(struct sock *sk) { } +static int mptcp_setsockopt(struct sock *sk, int level, int optname, + char __user *uoptval, unsigned int optlen) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + char __kernel *optval; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + /* will be treated as __user in tcp_setsockopt */ + optval = (char __kernel __force *)uoptval; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not defined. + */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (!IS_ERR(ssock)) { + pr_debug("subflow=%p", ssock->sk); + ret = kernel_setsockopt(ssock, level, optname, optval, optlen); + } + release_sock(sk); + + return ret; +} + +static int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *uoptval, int __user *uoption) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + char __kernel *optval; + int ret = -EOPNOTSUPP; + int __kernel *option; + struct socket *ssock; + + /* will be treated as __user in tcp_getsockopt */ + optval = (char __kernel __force *)uoptval; + option = (int __kernel __force *)uoption; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of getsockopt() when the socket is connected and + * there are multiple subflows is not defined. + */ + lock_sock(sk); + ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); + if (!IS_ERR(ssock)) { + pr_debug("subflow=%p", ssock->sk); + ret = kernel_getsockopt(ssock, level, optname, optval, option); + } + release_sock(sk); + + return ret; +} + static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -380,6 +436,8 @@ static struct proto mptcp_prot = { .init = mptcp_init_sock, .close = mptcp_close, .accept = mptcp_accept, + .setsockopt = mptcp_setsockopt, + .getsockopt = mptcp_getsockopt, .shutdown = tcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, -- cgit v1.2.3 From 6d0060f600adfddaa43fefb96b6b12643331961e Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 21 Jan 2020 16:56:23 -0800 Subject: mptcp: Write MPTCP DSS headers to outgoing data packets Per-packet metadata required to write the MPTCP DSS option is written to the skb_ext area. One write to the socket may contain more than one packet of data, which is copied to page fragments and mapped in to MPTCP DSS segments with size determined by the available page fragments and the maximum mapping length allowed by the MPTCP specification. If do_tcp_sendpages() splits a DSS segment in to multiple skbs, that's ok - the later skbs can either have duplicated DSS mapping information or none at all, and the receiver can handle that. The current implementation uses the subflow frag cache and tcp sendpages to avoid excessive code duplication. More work is required to ensure that it works correctly under memory pressure and to support MPTCP-level retransmissions. The MPTCP DSS checksum is not yet implemented. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- include/net/mptcp.h | 1 + net/mptcp/options.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++-- net/mptcp/protocol.c | 116 +++++++++++++++++++++++++++++++++++++- net/mptcp/protocol.h | 20 +++++++ 4 files changed, 286 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index eabc57c3fde4..06dcc665135e 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -32,6 +32,7 @@ struct mptcp_out_options { u16 suboptions; u64 sndr_key; u64 rcvr_key; + struct mptcp_ext ext_copy; #endif }; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 52ff2301b68b..5ea127bc7f01 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -134,13 +134,13 @@ void mptcp_rcv_synsent(struct sock *sk) } } -bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, - unsigned int *size, unsigned int remaining, - struct mptcp_out_options *opts) +static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - if (subflow->mp_capable && !subflow->fourth_ack) { + if (!subflow->fourth_ack) { opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; @@ -153,6 +153,112 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, return false; } +static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, + struct mptcp_ext *ext) +{ + ext->data_fin = 1; + + if (!ext->use_map) { + /* RFC6824 requires a DSS mapping with specific values + * if DATA_FIN is set but no data payload is mapped + */ + ext->use_map = 1; + ext->dsn64 = 1; + ext->data_seq = mptcp_sk(subflow->conn)->write_seq; + ext->subflow_seq = 0; + ext->data_len = 1; + } else { + /* If there's an existing DSS mapping, DATA_FIN consumes + * 1 additional byte of mapping space. + */ + ext->data_len++; + } +} + +static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + unsigned int dss_size = 0; + struct mptcp_ext *mpext; + struct mptcp_sock *msk; + unsigned int ack_size; + u8 tcp_fin; + + if (skb) { + mpext = mptcp_get_ext(skb); + tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; + } else { + mpext = NULL; + tcp_fin = 0; + } + + if (!skb || (mpext && mpext->use_map) || tcp_fin) { + unsigned int map_size; + + map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + + remaining -= map_size; + dss_size = map_size; + if (mpext) + opts->ext_copy = *mpext; + + if (skb && tcp_fin && + subflow->conn->sk_state != TCP_ESTABLISHED) + mptcp_write_data_fin(subflow, &opts->ext_copy); + } + + ack_size = TCPOLEN_MPTCP_DSS_ACK64; + + /* Add kind/length/subtype/flag overhead if mapping is not populated */ + if (dss_size == 0) + ack_size += TCPOLEN_MPTCP_DSS_BASE; + + dss_size += ack_size; + + msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn); + if (msk) { + opts->ext_copy.data_ack = msk->ack_seq; + } else { + mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key, + NULL, &opts->ext_copy.data_ack); + opts->ext_copy.data_ack++; + } + + opts->ext_copy.ack64 = 1; + opts->ext_copy.use_ack = 1; + + *size = ALIGN(dss_size, 4); + return true; +} + +bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, + struct mptcp_out_options *opts) +{ + unsigned int opt_size = 0; + bool ret = false; + + if (mptcp_established_options_mp(sk, &opt_size, remaining, opts)) + ret = true; + else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, + opts)) + ret = true; + + /* we reserved enough space for the above options, and exceeding the + * TCP option space would be fatal + */ + if (WARN_ON_ONCE(opt_size > remaining)) + return false; + + *size += opt_size; + remaining -= opt_size; + + return ret; +} + bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { @@ -194,4 +300,45 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) ptr += 2; } } + + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { + struct mptcp_ext *mpext = &opts->ext_copy; + u8 len = TCPOLEN_MPTCP_DSS_BASE; + u8 flags = 0; + + if (mpext->use_ack) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; + } + + if (mpext->use_map) { + len += TCPOLEN_MPTCP_DSS_MAP64; + + /* Use only 64-bit mapping flags for now, add + * support for optional 32-bit mappings later. + */ + flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; + if (mpext->data_fin) + flags |= MPTCP_DSS_DATA_FIN; + } + + *ptr++ = htonl((TCPOPT_MPTCP << 24) | + (len << 16) | + (MPTCPOPT_DSS << 12) | + (flags)); + + if (mpext->use_ack) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } + + if (mpext->use_map) { + put_unaligned_be64(mpext->data_seq, ptr); + ptr += 2; + put_unaligned_be32(mpext->subflow_seq, ptr); + ptr += 1; + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } + } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 875ca48de4b2..8cf49193b1c0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -97,12 +97,93 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) return NULL; } +static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) +{ + if (!msk->cached_ext) + msk->cached_ext = __skb_ext_alloc(); + + return !!msk->cached_ext; +} + +static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, + struct msghdr *msg, long *timeo) +{ + int mss_now = 0, size_goal = 0, ret = 0; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_ext *mpext = NULL; + struct page_frag *pfrag; + struct sk_buff *skb; + size_t psize; + + /* use the mptcp page cache so that we can easily move the data + * from one substream to another, but do per subflow memory accounting + */ + pfrag = sk_page_frag(sk); + while (!sk_page_frag_refill(ssk, pfrag) || + !mptcp_ext_cache_refill(msk)) { + ret = sk_stream_wait_memory(ssk, timeo); + if (ret) + return ret; + } + + /* compute copy limit */ + mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); + psize = min_t(int, pfrag->size - pfrag->offset, size_goal); + + pr_debug("left=%zu", msg_data_left(msg)); + psize = copy_page_from_iter(pfrag->page, pfrag->offset, + min_t(size_t, msg_data_left(msg), psize), + &msg->msg_iter); + pr_debug("left=%zu", msg_data_left(msg)); + if (!psize) + return -EINVAL; + + /* Mark the end of the previous write so the beginning of the + * next write (with its own mptcp skb extension data) is not + * collapsed. + */ + skb = tcp_write_queue_tail(ssk); + if (skb) + TCP_SKB_CB(skb)->eor = 1; + + ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, + msg->msg_flags | MSG_SENDPAGE_NOTLAST); + if (ret <= 0) + return ret; + if (unlikely(ret < psize)) + iov_iter_revert(&msg->msg_iter, psize - ret); + + skb = tcp_write_queue_tail(ssk); + mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); + msk->cached_ext = NULL; + + memset(mpext, 0, sizeof(*mpext)); + mpext->data_seq = msk->write_seq; + mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; + mpext->data_len = ret; + mpext->use_map = 1; + mpext->dsn64 = 1; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d", + mpext->data_seq, mpext->subflow_seq, mpext->data_len, + mpext->dsn64); + + pfrag->offset += ret; + msk->write_seq += ret; + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + + tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); + return ret; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct socket *ssock; + size_t copied = 0; struct sock *ssk; - int ret; + int ret = 0; + long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; @@ -116,14 +197,29 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return ret; } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + ssk = mptcp_subflow_get(msk); if (!ssk) { release_sock(sk); return -ENOTCONN; } - ret = sock_sendmsg(ssk->sk_socket, msg); + pr_debug("conn_list->subflow=%p", ssk); + lock_sock(ssk); + while (msg_data_left(msg)) { + ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo); + if (ret < 0) + break; + + copied += ret; + } + + if (copied > 0) + ret = copied; + + release_sock(ssk); release_sock(sk); return ret; } @@ -235,6 +331,8 @@ static void mptcp_close(struct sock *sk, long timeout) __mptcp_close_ssk(sk, ssk, subflow, timeout); } + if (msk->cached_ext) + __skb_ext_put(msk->cached_ext); release_sock(sk); sk_common_release(sk); } @@ -286,6 +384,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; struct sock *ssk = newsk; + u64 ack_seq; subflow = mptcp_subflow_ctx(newsk); lock_sock(sk); @@ -310,6 +409,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, msk->subflow = NULL; mptcp_token_update_accept(newsk, new_mptcp_sock); + + mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); + msk->write_seq = subflow->idsn + 1; + ack_seq++; + msk->ack_seq = ack_seq; + subflow->rel_write_seq = 1; newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); @@ -404,6 +509,7 @@ void mptcp_finish_connect(struct sock *ssk) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; + u64 ack_seq; subflow = mptcp_subflow_ctx(ssk); @@ -413,12 +519,18 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); + ack_seq++; + subflow->rel_write_seq = 1; + /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); WRITE_ONCE(msk->token, subflow->token); + WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->ack_seq, ack_seq); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5f43fa0275c0..384ec4804198 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -32,6 +32,10 @@ #define TCPOLEN_MPTCP_MPC_SYN 12 #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 +#define TCPOLEN_MPTCP_DSS_BASE 4 +#define TCPOLEN_MPTCP_DSS_ACK64 8 +#define TCPOLEN_MPTCP_DSS_MAP64 14 +#define TCPOLEN_MPTCP_DSS_CHECKSUM 2 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) @@ -40,14 +44,24 @@ #define MPTCP_CAP_HMAC_SHA1 BIT(0) #define MPTCP_CAP_FLAG_MASK (0x3F) +/* MPTCP DSS flags */ +#define MPTCP_DSS_DATA_FIN BIT(4) +#define MPTCP_DSS_DSN64 BIT(3) +#define MPTCP_DSS_HAS_MAP BIT(2) +#define MPTCP_DSS_ACK64 BIT(1) +#define MPTCP_DSS_HAS_ACK BIT(0) + /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; u64 local_key; u64 remote_key; + u64 write_seq; + u64 ack_seq; u32 token; struct list_head conn_list; + struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ }; @@ -83,6 +97,7 @@ struct mptcp_subflow_context { u64 remote_key; u64 idsn; u32 token; + u32 rel_write_seq; u32 request_mptcp : 1, /* send MP_CAPABLE */ mp_capable : 1, /* remote is MPTCP capable */ fourth_ack : 1, /* send initial DSS */ @@ -144,4 +159,9 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, u32 *hash_out); +static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +{ + return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); +} + #endif /* __MPTCP_PROTOCOL_H */ -- cgit v1.2.3 From 648ef4b88673dadb8463bf0d4b10fbf33d55def8 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 21 Jan 2020 16:56:24 -0800 Subject: mptcp: Implement MPTCP receive path Parses incoming DSS options and populates outgoing MPTCP ACK fields. MPTCP fields are parsed from the TCP option header and placed in an skb extension, allowing the upper MPTCP layer to access MPTCP options after the skb has gone through the TCP stack. The subflow implements its own data_ready() ops, which ensures that the pending data is in sequence - according to MPTCP seq number - dropping out-of-seq skbs. The DATA_READY bit flag is set if this is the case. This allows the MPTCP socket layer to determine if more data is available without having to consult the individual subflows. It additionally validates the current mapping and propagates EoF events to the connection socket. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- include/linux/tcp.h | 10 ++ include/net/mptcp.h | 8 ++ net/ipv4/tcp_input.c | 8 +- net/mptcp/options.c | 107 ++++++++++++++ net/mptcp/protocol.c | 34 +++++ net/mptcp/protocol.h | 64 ++++++++- net/mptcp/subflow.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 609 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e9ee06d887fa..0d00dad4b85d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -82,9 +82,19 @@ struct tcp_sack_block { struct mptcp_options_received { u64 sndr_key; u64 rcvr_key; + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; u8 mp_capable : 1, mp_join : 1, dss : 1; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + __unused:3; }; #endif diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 06dcc665135e..8619c1fca741 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -60,6 +60,8 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx); void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); @@ -150,6 +152,12 @@ static inline bool mptcp_established_options(struct sock *sk, return false; } +static inline void mptcp_incoming_options(struct sock *sk, + struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ +} + static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5165c8de47ee..28d31f2c1422 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4770,6 +4770,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); return; @@ -6346,8 +6349,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_LAST_ACK: - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); break; + } /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 5ea127bc7f01..1fa8496f3551 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -14,6 +14,7 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, { struct mptcp_options_received *mp_opt = &opt_rx->mptcp; u8 subtype = *ptr >> 4; + int expected_opsize; u8 version; u8 flags; @@ -64,7 +65,79 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, case MPTCPOPT_DSS: pr_debug("DSS"); + ptr++; + + flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; + mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; + mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; + mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; + mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; + mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); + + pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", + mp_opt->data_fin, mp_opt->dsn64, + mp_opt->use_map, mp_opt->ack64, + mp_opt->use_ack); + + expected_opsize = TCPOLEN_MPTCP_DSS_BASE; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) + expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) + expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; + else + expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; + } + + /* RFC 6824, Section 3.3: + * If a checksum is present, but its use had + * not been negotiated in the MP_CAPABLE handshake, + * the checksum field MUST be ignored. + */ + if (opsize != expected_opsize && + opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) + break; + mp_opt->dss = 1; + + if (mp_opt->use_ack) { + if (mp_opt->ack64) { + mp_opt->data_ack = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_ack = get_unaligned_be32(ptr); + ptr += 4; + } + + pr_debug("data_ack=%llu", mp_opt->data_ack); + } + + if (mp_opt->use_map) { + if (mp_opt->dsn64) { + mp_opt->data_seq = get_unaligned_be64(ptr); + ptr += 8; + } else { + mp_opt->data_seq = get_unaligned_be32(ptr); + ptr += 4; + } + + mp_opt->subflow_seq = get_unaligned_be32(ptr); + ptr += 4; + + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + mp_opt->data_seq, mp_opt->subflow_seq, + mp_opt->data_len); + } + break; default: @@ -275,6 +348,40 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return false; } +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *opt_rx) +{ + struct mptcp_options_received *mp_opt; + struct mptcp_ext *mpext; + + mp_opt = &opt_rx->mptcp; + + if (!mp_opt->dss) + return; + + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + if (!mpext) + return; + + memset(mpext, 0, sizeof(*mpext)); + + if (mp_opt->use_map) { + mpext->data_seq = mp_opt->data_seq; + mpext->subflow_seq = mp_opt->subflow_seq; + mpext->data_len = mp_opt->data_len; + mpext->use_map = 1; + mpext->dsn64 = mp_opt->dsn64; + } + + if (mp_opt->use_ack) { + mpext->data_ack = mp_opt->data_ack; + mpext->use_ack = 1; + mpext->ack64 = mp_opt->ack64; + } + + mpext->data_fin = mp_opt->data_fin; +} + void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cf49193b1c0..71250149180b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -224,6 +224,33 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return ret; } +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct mptcp_read_arg *arg = desc->arg.data; + size_t copy_len; + + copy_len = min(desc->count, len); + + if (likely(arg->msg)) { + int err; + + err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len); + if (err) { + pr_debug("error path"); + desc->error = err; + return err; + } + } else { + pr_debug("Flushing skb payload"); + } + + desc->count -= copy_len; + + pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count); + return copy_len; +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -414,7 +441,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, msk->write_seq = subflow->idsn + 1; ack_seq++; msk->ack_seq = ack_seq; + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; subflow->rel_write_seq = 1; + subflow->tcp_sock = ssk; newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); @@ -519,8 +549,12 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); + pr_debug("msk=%p, token=%u", sk, subflow->token); + mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); ack_seq++; + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; subflow->rel_write_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 384ec4804198..c6d8217e24d4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -33,7 +33,9 @@ #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_DSS_BASE 4 +#define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 +#define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 @@ -50,6 +52,10 @@ #define MPTCP_DSS_HAS_MAP BIT(2) #define MPTCP_DSS_ACK64 BIT(1) #define MPTCP_DSS_HAS_ACK BIT(0) +#define MPTCP_DSS_FLAG_MASK (0x1F) + +/* MPTCP socket flags */ +#define MPTCP_DATA_READY BIT(0) /* MPTCP connection sock */ struct mptcp_sock { @@ -60,6 +66,7 @@ struct mptcp_sock { u64 write_seq; u64 ack_seq; u32 token; + unsigned long flags; struct list_head conn_list; struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ @@ -82,6 +89,7 @@ struct mptcp_subflow_request_sock { u64 remote_key; u64 idsn; u32 token; + u32 ssn_offset; }; static inline struct mptcp_subflow_request_sock * @@ -96,15 +104,27 @@ struct mptcp_subflow_context { u64 local_key; u64 remote_key; u64 idsn; + u64 map_seq; u32 token; u32 rel_write_seq; + u32 map_subflow_seq; + u32 ssn_offset; + u32 map_data_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ mp_capable : 1, /* remote is MPTCP capable */ fourth_ack : 1, /* send initial DSS */ - conn_finished : 1; + conn_finished : 1, + map_valid : 1, + data_avail : 1, + rx_eof : 1; + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; + void (*tcp_data_ready)(struct sock *sk); + void (*tcp_state_change)(struct sock *sk); + void (*tcp_write_space)(struct sock *sk); + struct rcu_head rcu; }; @@ -123,14 +143,49 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) return subflow->tcp_sock; } +static inline u64 +mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) +{ + return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - + subflow->ssn_offset - + subflow->map_subflow_seq; +} + +static inline u64 +mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) +{ + return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); +} + +int mptcp_is_enabled(struct net *net); +bool mptcp_subflow_data_available(struct sock *sk); void mptcp_subflow_init(void); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); +static inline void mptcp_subflow_tcp_fallback(struct sock *sk, + struct mptcp_subflow_context *ctx) +{ + sk->sk_data_ready = ctx->tcp_data_ready; + sk->sk_state_change = ctx->tcp_state_change; + sk->sk_write_space = ctx->tcp_write_space; + + inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; +} + extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_MPTCP_IPV6) extern const struct inet_connection_sock_af_ops ipv6_specific; #endif +void mptcp_proto_init(void); + +struct mptcp_read_arg { + struct msghdr *msg; +}; + +int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len); + void mptcp_get_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx); @@ -164,4 +219,11 @@ static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 89b91bc7a831..528351e26371 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -78,6 +78,7 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->remote_key = rx_opt.mptcp.sndr_key; + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; } } @@ -116,6 +117,11 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key); mptcp_finish_connect(sk); subflow->conn_finished = 1; + + if (skb) { + pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + } } } @@ -210,6 +216,323 @@ close_child: static struct inet_connection_sock_af_ops subflow_specific; +enum mapping_status { + MAPPING_OK, + MAPPING_INVALID, + MAPPING_EMPTY, + MAPPING_DATA_FIN +}; + +static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) +{ + if ((u32)seq == (u32)old_seq) + return old_seq; + + /* Assume map covers data not mapped yet. */ + return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); +} + +static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) +{ + WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + ssn, subflow->map_subflow_seq, subflow->map_data_len); +} + +static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned int skb_consumed; + + skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(skb_consumed >= skb->len)) + return true; + + return skb->len - skb_consumed <= subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); +} + +static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + + if (unlikely(before(ssn, subflow->map_subflow_seq))) { + /* Mapping covers data later in the subflow stream, + * currently unsupported. + */ + warn_bad_map(subflow, ssn); + return false; + } + if (unlikely(!before(ssn, subflow->map_subflow_seq + + subflow->map_data_len))) { + /* Mapping does covers past subflow data, invalid */ + warn_bad_map(subflow, ssn + skb->len); + return false; + } + return true; +} + +static enum mapping_status get_mapping_status(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_ext *mpext; + struct sk_buff *skb; + u16 data_len; + u64 map_seq; + + skb = skb_peek(&ssk->sk_receive_queue); + if (!skb) + return MAPPING_EMPTY; + + mpext = mptcp_get_ext(skb); + if (!mpext || !mpext->use_map) { + if (!subflow->map_valid && !skb->len) { + /* the TCP stack deliver 0 len FIN pkt to the receive + * queue, that is the only 0len pkts ever expected here, + * and we can admit no mapping only for 0 len pkts + */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + WARN_ONCE(1, "0len seq %d:%d flags %x", + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + TCP_SKB_CB(skb)->tcp_flags); + sk_eat_skb(ssk, skb); + return MAPPING_EMPTY; + } + + if (!subflow->map_valid) + return MAPPING_INVALID; + + goto validate_seq; + } + + pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d", + mpext->data_seq, mpext->dsn64, mpext->subflow_seq, + mpext->data_len, mpext->data_fin); + + data_len = mpext->data_len; + if (data_len == 0) { + pr_err("Infinite mapping not handled"); + return MAPPING_INVALID; + } + + if (mpext->data_fin == 1) { + if (data_len == 1) { + pr_debug("DATA_FIN with no payload"); + if (subflow->map_valid) { + /* A DATA_FIN might arrive in a DSS + * option before the previous mapping + * has been fully consumed. Continue + * handling the existing mapping. + */ + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } else { + return MAPPING_DATA_FIN; + } + } + + /* Adjust for DATA_FIN using 1 byte of sequence space */ + data_len--; + } + + if (!mpext->dsn64) { + map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, + mpext->data_seq); + pr_debug("expanded seq=%llu", subflow->map_seq); + } else { + map_seq = mpext->data_seq; + } + + if (subflow->map_valid) { + /* Allow replacing only with an identical map */ + if (subflow->map_seq == map_seq && + subflow->map_subflow_seq == mpext->subflow_seq && + subflow->map_data_len == data_len) { + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; + } + + /* If this skb data are fully covered by the current mapping, + * the new map would need caching, which is not supported + */ + if (skb_is_fully_mapped(ssk, skb)) + return MAPPING_INVALID; + + /* will validate the next map after consuming the current one */ + return MAPPING_OK; + } + + subflow->map_seq = map_seq; + subflow->map_subflow_seq = mpext->subflow_seq; + subflow->map_data_len = data_len; + subflow->map_valid = 1; + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_seq, subflow->map_subflow_seq, + subflow->map_data_len); + +validate_seq: + /* we revalidate valid mapping on new skb, because we must ensure + * the current skb is completely covered by the available mapping + */ + if (!validate_mapping(ssk, skb)) + return MAPPING_INVALID; + + skb_ext_del(skb, SKB_EXT_MPTCP); + return MAPPING_OK; +} + +static bool subflow_check_data_avail(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + enum mapping_status status; + struct mptcp_sock *msk; + struct sk_buff *skb; + + pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, + subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); + if (subflow->data_avail) + return true; + + if (!subflow->conn) + return false; + + msk = mptcp_sk(subflow->conn); + for (;;) { + u32 map_remaining; + size_t delta; + u64 ack_seq; + u64 old_ack; + + status = get_mapping_status(ssk); + pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); + if (status == MAPPING_INVALID) { + ssk->sk_err = EBADMSG; + goto fatal; + } + + if (status != MAPPING_OK) + return false; + + skb = skb_peek(&ssk->sk_receive_queue); + if (WARN_ON_ONCE(!skb)) + return false; + + old_ack = READ_ONCE(msk->ack_seq); + ack_seq = mptcp_subflow_get_mapped_dsn(subflow); + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, + ack_seq); + if (ack_seq == old_ack) + break; + + /* only accept in-sequence mapping. Old values are spurious + * retransmission; we can hit "future" values on active backup + * subflow switch, we relay on retransmissions to get + * in-sequence data. + * Cuncurrent subflows support will require subflow data + * reordering + */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + if (before64(ack_seq, old_ack)) + delta = min_t(size_t, old_ack - ack_seq, map_remaining); + else + delta = min_t(size_t, ack_seq - old_ack, map_remaining); + + /* discard mapped data */ + pr_debug("discarding %zu bytes, current map len=%d", delta, + map_remaining); + if (delta) { + struct mptcp_read_arg arg = { + .msg = NULL, + }; + read_descriptor_t desc = { + .count = delta, + .arg.data = &arg, + }; + int ret; + + ret = tcp_read_sock(ssk, &desc, mptcp_read_actor); + if (ret < 0) { + ssk->sk_err = -ret; + goto fatal; + } + if (ret < delta) + return false; + if (delta == map_remaining) + subflow->map_valid = 0; + } + } + return true; + +fatal: + /* fatal protocol error, close the socket */ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); + ssk->sk_error_report(ssk); + tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + return false; +} + +bool mptcp_subflow_data_available(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sk_buff *skb; + + /* check if current mapping is still valid */ + if (subflow->map_valid && + mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { + subflow->map_valid = 0; + subflow->data_avail = 0; + + pr_debug("Done with mapping: seq=%u data_len=%u", + subflow->map_subflow_seq, + subflow->map_data_len); + } + + if (!subflow_check_data_avail(sk)) { + subflow->data_avail = 0; + return false; + } + + skb = skb_peek(&sk->sk_receive_queue); + subflow->data_avail = skb && + before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq); + return subflow->data_avail; +} + +static void subflow_data_ready(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + if (!parent || !subflow->mp_capable) { + subflow->tcp_data_ready(sk); + + if (parent) + parent->sk_data_ready(parent); + return; + } + + if (mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } +} + +static void subflow_write_space(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; + + sk_stream_write_space(sk); + if (parent && sk_stream_is_writeable(sk)) { + sk_stream_write_space(parent); + } +} + static struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { @@ -296,6 +619,47 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, return ctx; } +static void __subflow_state_change(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static bool subflow_is_done(const struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +static void subflow_state_change(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = READ_ONCE(subflow->conn); + + __subflow_state_change(sk); + + /* as recvmsg() does not acquire the subflow socket for ssk selection + * a fin packet carrying a DSS can be unnoticed if we don't trigger + * the data available machinery here. + */ + if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) { + set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags); + + parent->sk_data_ready(parent); + } + + if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) && + !subflow->rx_eof && subflow_is_done(sk)) { + subflow->rx_eof = 1; + parent->sk_shutdown |= RCV_SHUTDOWN; + __subflow_state_change(parent); + } +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -322,6 +686,12 @@ static int subflow_ulp_init(struct sock *sk) tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); + ctx->tcp_data_ready = sk->sk_data_ready; + ctx->tcp_state_change = sk->sk_state_change; + ctx->tcp_write_space = sk->sk_write_space; + sk->sk_data_ready = subflow_data_ready; + sk->sk_write_space = subflow_write_space; + sk->sk_state_change = subflow_state_change; out: return err; } @@ -339,10 +709,12 @@ static void subflow_ulp_release(struct sock *sk) kfree_rcu(ctx, rcu); } -static void subflow_ulp_fallback(struct sock *sk) +static void subflow_ulp_fallback(struct sock *sk, + struct mptcp_subflow_context *old_ctx) { struct inet_connection_sock *icsk = inet_csk(sk); + mptcp_subflow_tcp_fallback(sk, old_ctx); icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; @@ -357,23 +729,28 @@ static void subflow_ulp_clone(const struct request_sock *req, struct mptcp_subflow_context *new_ctx; if (!subflow_req->mp_capable) { - subflow_ulp_fallback(newsk); + subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx = subflow_create_ctx(newsk, priority); if (new_ctx == NULL) { - subflow_ulp_fallback(newsk); + subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; + new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; + new_ctx->tcp_state_change = old_ctx->tcp_state_change; + new_ctx->tcp_write_space = old_ctx->tcp_write_space; new_ctx->mp_capable = 1; new_ctx->fourth_ack = 1; new_ctx->remote_key = subflow_req->remote_key; new_ctx->local_key = subflow_req->local_key; new_ctx->token = subflow_req->token; + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->idsn = subflow_req->idsn; } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { -- cgit v1.2.3 From 1891c4a07672b300b3160ceec59755063b20222b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 Jan 2020 16:56:25 -0800 Subject: mptcp: add subflow write space signalling and mptcp_poll Add new SEND_SPACE flag to indicate that a subflow has enough space to accept more data for transmission. It gets cleared at the end of mptcp_sendmsg() in case ssk has run below the free watermark. It is (re-set) from the wspace callback. This allows us to use msk->flags to determine the poll mask. Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Signed-off-by: Florian Westphal Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 3 +++ 3 files changed, 57 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 71250149180b..408efbe34753 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -176,6 +176,23 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return ret; } +static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) +{ + struct socket *sock; + + if (likely(sk_stream_is_writeable(ssk))) + return; + + sock = READ_ONCE(ssk->sk_socket); + + if (sock) { + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + /* set NOSPACE only after clearing SEND_SPACE flag */ + set_bit(SOCK_NOSPACE, &sock->flags); + } +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -219,6 +236,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (copied > 0) ret = copied; + ssk_check_wmem(msk, ssk); release_sock(ssk); release_sock(sk); return ret; @@ -315,6 +333,7 @@ static int mptcp_init_sock(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); INIT_LIST_HEAD(&msk->conn_list); + __set_bit(MPTCP_SEND_SPACE, &msk->flags); return 0; } @@ -576,6 +595,13 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } +static bool mptcp_memory_free(const struct sock *sk, int wake) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -591,6 +617,7 @@ static struct proto mptcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = mptcp_get_port, + .stream_memory_free = mptcp_memory_free, .obj_size = sizeof(struct mptcp_sock), .no_autobind = true, }; @@ -767,8 +794,34 @@ unlock_fail: static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { + const struct mptcp_sock *msk; + struct sock *sk = sock->sk; + struct socket *ssock; __poll_t mask = 0; + msk = mptcp_sk(sk); + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (ssock) { + mask = ssock->ops->poll(file, ssock, wait); + release_sock(sk); + return mask; + } + + release_sock(sk); + sock_poll_wait(file, sock, wait); + lock_sock(sk); + + if (test_bit(MPTCP_DATA_READY, &msk->flags)) + mask = EPOLLIN | EPOLLRDNORM; + if (sk_stream_is_writeable(sk) && + test_bit(MPTCP_SEND_SPACE, &msk->flags)) + mask |= EPOLLOUT | EPOLLWRNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + + release_sock(sk); + return mask; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index c6d8217e24d4..59a83eb64d37 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -56,6 +56,7 @@ /* MPTCP socket flags */ #define MPTCP_DATA_READY BIT(0) +#define MPTCP_SEND_SPACE BIT(1) /* MPTCP connection sock */ struct mptcp_sock { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 528351e26371..9fb3eb87a20f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -529,6 +529,9 @@ static void subflow_write_space(struct sock *sk) sk_stream_write_space(sk); if (parent && sk_stream_is_writeable(sk)) { + set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); + smp_mb__after_atomic(); + /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ sk_stream_write_space(parent); } } -- cgit v1.2.3 From 7a6a6cbc3e592e339ad23e4e8ede9a3f6160bda8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 21 Jan 2020 16:56:26 -0800 Subject: mptcp: recvmsg() can drain data from multiple subflows With the previous patch in place, the msk can detect which subflow has the current map with a simple walk, let's update the main loop to always select the 'current' subflow. The exit conditions now closely mirror tcp_recvmsg() to get expected timeout and signal behavior. Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Co-developed-by: Davide Caratti Signed-off-by: Davide Caratti Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Mat Martineau Signed-off-by: Mat Martineau Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 168 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 408efbe34753..ad9c73cc20e1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -105,6 +107,21 @@ static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) return !!msk->cached_ext; } +static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + sock_owned_by_me(sk); + + mptcp_for_each_subflow(msk, subflow) { + if (subflow->data_avail) + return mptcp_subflow_tcp_sock(subflow); + } + + return NULL; +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct msghdr *msg, long *timeo) { @@ -269,13 +286,37 @@ int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb, return copy_len; } +static void mptcp_wait_data(struct sock *sk, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct mptcp_sock *msk = mptcp_sk(sk); + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + + sk_wait_event(sk, timeo, + test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait); + + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; + bool more_data_avail = false; + struct mptcp_read_arg arg; + read_descriptor_t desc; + bool wait_data = false; struct socket *ssock; + struct tcp_sock *tp; + bool done = false; struct sock *ssk; int copied = 0; + int target; + long timeo; if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) return -EOPNOTSUPP; @@ -290,16 +331,124 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return copied; } - ssk = mptcp_subflow_get(msk); - if (!ssk) { - release_sock(sk); - return -ENOTCONN; + arg.msg = msg; + desc.arg.data = &arg; + desc.error = 0; + + timeo = sock_rcvtimeo(sk, nonblock); + + len = min_t(size_t, len, INT_MAX); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + while (!done) { + u32 map_remaining; + int bytes_read; + + ssk = mptcp_subflow_recv_lookup(msk); + pr_debug("msk=%p ssk=%p", msk, ssk); + if (!ssk) + goto wait_for_data; + + subflow = mptcp_subflow_ctx(ssk); + tp = tcp_sk(ssk); + + lock_sock(ssk); + do { + /* try to read as much data as available */ + map_remaining = subflow->map_data_len - + mptcp_subflow_get_map_offset(subflow); + desc.count = min_t(size_t, len - copied, map_remaining); + pr_debug("reading %zu bytes, copied %d", desc.count, + copied); + bytes_read = tcp_read_sock(ssk, &desc, + mptcp_read_actor); + if (bytes_read < 0) { + if (!copied) + copied = bytes_read; + done = true; + goto next; + } + + pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq, + msk->ack_seq + bytes_read); + msk->ack_seq += bytes_read; + copied += bytes_read; + if (copied >= len) { + done = true; + goto next; + } + if (tp->urg_data && tp->urg_seq == tp->copied_seq) { + pr_err("Urgent data present, cannot proceed"); + done = true; + goto next; + } +next: + more_data_avail = mptcp_subflow_data_available(ssk); + } while (more_data_avail && !done); + release_sock(ssk); + continue; + +wait_for_data: + more_data_avail = false; + + /* only the master socket status is relevant here. The exit + * conditions mirror closely tcp_recvmsg() + */ + if (copied >= target) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + copied = -ENOTCONN; + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + pr_debug("block timeout %ld", timeo); + wait_data = true; + mptcp_wait_data(sk, &timeo); } - copied = sock_recvmsg(ssk->sk_socket, msg, flags); + if (more_data_avail) { + if (!test_bit(MPTCP_DATA_READY, &msk->flags)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } else if (!wait_data) { + clear_bit(MPTCP_DATA_READY, &msk->flags); - release_sock(sk); + /* .. race-breaker: ssk might get new data after last + * data_available() returns false. + */ + ssk = mptcp_subflow_recv_lookup(msk); + if (unlikely(ssk)) + set_bit(MPTCP_DATA_READY, &msk->flags); + } + release_sock(sk); return copied; } @@ -460,10 +609,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, msk->write_seq = subflow->idsn + 1; ack_seq++; msk->ack_seq = ack_seq; - subflow->map_seq = ack_seq; - subflow->map_subflow_seq = 1; - subflow->rel_write_seq = 1; - subflow->tcp_sock = ssk; newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); @@ -475,6 +620,19 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, bh_unlock_sock(new_mptcp_sock); local_bh_enable(); release_sock(sk); + + /* the subflow can already receive packet, avoid racing with + * the receive path and process the pending ones + */ + lock_sock(ssk); + subflow->map_seq = ack_seq; + subflow->map_subflow_seq = 1; + subflow->rel_write_seq = 1; + subflow->tcp_sock = ssk; + subflow->conn = new_mptcp_sock; + if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue))) + mptcp_subflow_data_available(ssk); + release_sock(ssk); } return newsk; -- cgit v1.2.3 From 57040755a3e43a1ee2d4ce9c83e87de27b570104 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 21 Jan 2020 16:56:27 -0800 Subject: mptcp: allow collapsing consecutive sendpages on the same substream If the current sendmsg() lands on the same subflow we used last, we can try to collapse the data. Signed-off-by: Paolo Abeni Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 75 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ad9c73cc20e1..0b21ae25bd0f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -122,14 +122,27 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) return NULL; } +static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, + const struct sk_buff *skb, + const struct mptcp_ext *mpext) +{ + if (!tcp_skb_can_collapse_to(skb)) + return false; + + /* can collapse only if MPTCP level sequence is in order */ + return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, long *timeo) + struct msghdr *msg, long *timeo, int *pmss_now, + int *ps_goal) { - int mss_now = 0, size_goal = 0, ret = 0; + int mss_now, avail_size, size_goal, ret; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_ext *mpext = NULL; + struct sk_buff *skb, *tail; + bool can_collapse = false; struct page_frag *pfrag; - struct sk_buff *skb; size_t psize; /* use the mptcp page cache so that we can easily move the data @@ -145,8 +158,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* compute copy limit */ mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); - psize = min_t(int, pfrag->size - pfrag->offset, size_goal); + *pmss_now = mss_now; + *ps_goal = size_goal; + avail_size = size_goal; + skb = tcp_write_queue_tail(ssk); + if (skb) { + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + + /* Limit the write to the size available in the + * current skb, if any, so that we create at most a new skb. + * Explicitly tells TCP internals to avoid collapsing on later + * queue management operation, to avoid breaking the ext <-> + * SSN association set here + */ + can_collapse = (size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(msk, skb, mpext); + if (!can_collapse) + TCP_SKB_CB(skb)->eor = 1; + else + avail_size = size_goal - skb->len; + } + psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size); + /* Copy to page */ pr_debug("left=%zu", msg_data_left(msg)); psize = copy_page_from_iter(pfrag->page, pfrag->offset, min_t(size_t, msg_data_left(msg), psize), @@ -155,14 +189,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (!psize) return -EINVAL; - /* Mark the end of the previous write so the beginning of the - * next write (with its own mptcp skb extension data) is not - * collapsed. + /* tell the TCP stack to delay the push so that we can safely + * access the skb after the sendpages call */ - skb = tcp_write_queue_tail(ssk); - if (skb) - TCP_SKB_CB(skb)->eor = 1; - ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, msg->msg_flags | MSG_SENDPAGE_NOTLAST); if (ret <= 0) @@ -170,6 +199,18 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (unlikely(ret < psize)) iov_iter_revert(&msg->msg_iter, psize - ret); + /* if the tail skb extension is still the cached one, collapsing + * really happened. Note: we can't check for 'same skb' as the sk_buff + * hdr on tail can be transmitted, freed and re-allocated by the + * do_tcp_sendpages() call + */ + tail = tcp_write_queue_tail(ssk); + if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { + WARN_ON_ONCE(!can_collapse); + mpext->data_len += ret; + goto out; + } + skb = tcp_write_queue_tail(ssk); mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); msk->cached_ext = NULL; @@ -185,11 +226,11 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); +out: pfrag->offset += ret; msk->write_seq += ret; mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); return ret; } @@ -212,11 +253,11 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct socket *ssock; size_t copied = 0; struct sock *ssk; - int ret = 0; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) @@ -243,15 +284,19 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(ssk); while (msg_data_left(msg)) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo); + ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, + &size_goal); if (ret < 0) break; copied += ret; } - if (copied > 0) + if (copied) { ret = copied; + tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, + size_goal); + } ssk_check_wmem(msk, ssk); release_sock(ssk); -- cgit v1.2.3 From 784325e9f037e5f7a7f9a46ecbb27384128f8b6e Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 21 Jan 2020 16:56:28 -0800 Subject: mptcp: new sysctl to control the activation per NS New MPTCP sockets will return -ENOPROTOOPT if MPTCP support is disabled for the current net namespace. We are providing here a way to control access to the feature for those that need to turn it on or off. The value of this new sysctl can be different per namespace. We can then restrict the usage of MPTCP to the selected NS. In case of serious issues with MPTCP, administrators can now easily turn MPTCP off. Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Signed-off-by: Matthieu Baerts Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/Makefile | 2 +- net/mptcp/ctrl.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 16 +++++-- net/mptcp/protocol.h | 3 ++ 4 files changed, 146 insertions(+), 5 deletions(-) create mode 100644 net/mptcp/ctrl.c (limited to 'net') diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 178ae81d8b66..4e98d9edfd0a 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o token.o crypto.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c new file mode 100644 index 000000000000..8e39585d37f3 --- /dev/null +++ b/net/mptcp/ctrl.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2019, Tessares SA. + */ + +#include + +#include +#include + +#include "protocol.h" + +#define MPTCP_SYSCTL_PATH "net/mptcp" + +static int mptcp_pernet_id; +struct mptcp_pernet { + struct ctl_table_header *ctl_table_hdr; + + int mptcp_enabled; +}; + +static struct mptcp_pernet *mptcp_get_pernet(struct net *net) +{ + return net_generic(net, mptcp_pernet_id); +} + +int mptcp_is_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->mptcp_enabled; +} + +static struct ctl_table mptcp_sysctl_table[] = { + { + .procname = "enabled", + .maxlen = sizeof(int), + .mode = 0644, + /* users with CAP_NET_ADMIN or root (not and) can change this + * value, same as other sysctl or the 'net' tree. + */ + .proc_handler = proc_dointvec, + }, + {} +}; + +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; +} + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = mptcp_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); + if (!table) + goto err_alloc; + } + + table[0].data = &pernet->mptcp_enabled; + + hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); + if (!hdr) + goto err_reg; + + pernet->ctl_table_hdr = hdr; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) +{ + struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; + + unregister_net_sysctl_table(pernet->ctl_table_hdr); + + kfree(table); +} + +static int __net_init mptcp_net_init(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_set_defaults(pernet); + + return mptcp_pernet_new_table(net, pernet); +} + +/* Note: the callback will only be called per extra netns */ +static void __net_exit mptcp_net_exit(struct net *net) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + + mptcp_pernet_del_table(pernet); +} + +static struct pernet_operations mptcp_pernet_ops = { + .init = mptcp_net_init, + .exit = mptcp_net_exit, + .id = &mptcp_pernet_id, + .size = sizeof(struct mptcp_pernet), +}; + +void __init mptcp_init(void) +{ + mptcp_proto_init(); + + if (register_pernet_subsys(&mptcp_pernet_ops) < 0) + panic("Failed to register MPTCP pernet subsystem.\n"); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int __init mptcpv6_init(void) +{ + int err; + + err = mptcp_proto_v6_init(); + + return err; +} +#endif diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0b21ae25bd0f..45e482864a19 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -522,7 +522,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, } } -static int mptcp_init_sock(struct sock *sk) +static int __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -532,6 +532,14 @@ static int mptcp_init_sock(struct sock *sk) return 0; } +static int mptcp_init_sock(struct sock *sk) +{ + if (!mptcp_is_enabled(sock_net(sk))) + return -ENOPROTOOPT; + + return __mptcp_init_sock(sk); +} + static void mptcp_subflow_shutdown(struct sock *ssk, int how) { lock_sock(ssk); @@ -640,7 +648,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, return NULL; } - mptcp_init_sock(new_mptcp_sock); + __mptcp_init_sock(new_mptcp_sock); msk = mptcp_sk(new_mptcp_sock); msk->remote_key = subflow->remote_key; @@ -1078,7 +1086,7 @@ static struct inet_protosw mptcp_protosw = { .flags = INET_PROTOSW_ICSK, }; -void __init mptcp_init(void) +void mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; mptcp_stream_ops = inet_stream_ops; @@ -1116,7 +1124,7 @@ static struct inet_protosw mptcp_v6_protosw = { .flags = INET_PROTOSW_ICSK, }; -int mptcpv6_init(void) +int mptcp_proto_v6_init(void) { int err; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 59a83eb64d37..a8fad7d78565 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -179,6 +179,9 @@ extern const struct inet_connection_sock_af_ops ipv6_specific; #endif void mptcp_proto_init(void); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +int mptcp_proto_v6_init(void); +#endif struct mptcp_read_arg { struct msghdr *msg; -- cgit v1.2.3 From 65492c5a6ab5df5091a77562dbcca2d2dc3877c0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 21 Jan 2020 16:56:30 -0800 Subject: mptcp: move from sha1 (v0) to sha256 (v1) For simplicity's sake use directly sha256 primitives (and pull them as a required build dep). Add optional, boot-time self-tests for the hmac function. Signed-off-by: Paolo Abeni Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/Kconfig | 10 ++++ net/mptcp/crypto.c | 140 +++++++++++++++++++++++++++++++-------------------- net/mptcp/options.c | 9 +++- net/mptcp/protocol.h | 4 +- 4 files changed, 104 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index c1a99f07a4cd..5db56d2218c5 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -3,6 +3,7 @@ config MPTCP bool "MPTCP: Multipath TCP" depends on INET select SKB_EXTENSIONS + select CRYPTO_LIB_SHA256 help Multipath TCP (MPTCP) connections send and receive data over multiple subflows in order to utilize multiple network paths. Each subflow @@ -14,3 +15,12 @@ config MPTCP_IPV6 depends on MPTCP select IPV6 default y + +config MPTCP_HMAC_TEST + bool "Tests for MPTCP HMAC implementation" + default n + help + This option enable boot time self-test for the HMAC implementation + used by the MPTCP code + + Say N if you are unsure. diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index bbd6d01af211..40d1bb18fd60 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -21,67 +21,36 @@ */ #include -#include +#include #include #include "protocol.h" -struct sha1_state { - u32 workspace[SHA_WORKSPACE_WORDS]; - u32 digest[SHA_DIGEST_WORDS]; - unsigned int count; -}; - -static void sha1_init(struct sha1_state *state) -{ - sha_init(state->digest); - state->count = 0; -} - -static void sha1_update(struct sha1_state *state, u8 *input) -{ - sha_transform(state->digest, input, state->workspace); - state->count += SHA_MESSAGE_BYTES; -} - -static void sha1_pad_final(struct sha1_state *state, u8 *input, - unsigned int length, __be32 *mptcp_hashed_key) -{ - int i; - - input[length] = 0x80; - memset(&input[length + 1], 0, SHA_MESSAGE_BYTES - length - 9); - put_unaligned_be64((length + state->count) << 3, - &input[SHA_MESSAGE_BYTES - 8]); - - sha_transform(state->digest, input, state->workspace); - for (i = 0; i < SHA_DIGEST_WORDS; ++i) - put_unaligned_be32(state->digest[i], &mptcp_hashed_key[i]); - - memzero_explicit(state->workspace, SHA_WORKSPACE_WORDS << 2); -} +#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4) void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) { - __be32 mptcp_hashed_key[SHA_DIGEST_WORDS]; - u8 input[SHA_MESSAGE_BYTES]; - struct sha1_state state; + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be64 input = cpu_to_be64(key); + struct sha256_state state; - sha1_init(&state); - put_unaligned_be64(key, input); - sha1_pad_final(&state, input, 8, mptcp_hashed_key); + sha256_init(&state); + sha256_update(&state, (__force u8 *)&input, sizeof(input)); + sha256_final(&state, (u8 *)mptcp_hashed_key); if (token) *token = be32_to_cpu(mptcp_hashed_key[0]); if (idsn) - *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[3])); + *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); } void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - u32 *hash_out) + void *hmac) { - u8 input[SHA_MESSAGE_BYTES * 2]; - struct sha1_state state; + u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; + __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; + __be32 *hash_out = (__force __be32 *)hmac; + struct sha256_state state; u8 key1be[8]; u8 key2be[8]; int i; @@ -96,17 +65,16 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; - put_unaligned_be32(nonce1, &input[SHA_MESSAGE_BYTES]); - put_unaligned_be32(nonce2, &input[SHA_MESSAGE_BYTES + 4]); + put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); + put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); - sha1_init(&state); - sha1_update(&state, input); + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); /* emit sha256(K1 || msg) on the second input block, so we can * reuse 'input' for the last hashing */ - sha1_pad_final(&state, &input[SHA_MESSAGE_BYTES], 8, - (__force __be32 *)&input[SHA_MESSAGE_BYTES]); + sha256_final(&state, &input[SHA256_BLOCK_SIZE]); /* Prepare second part of hmac */ memset(input, 0x5C, SHA_MESSAGE_BYTES); @@ -115,8 +83,70 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; - sha1_init(&state); - sha1_update(&state, input); - sha1_pad_final(&state, &input[SHA_MESSAGE_BYTES], SHA_DIGEST_WORDS << 2, - (__be32 *)hash_out); + sha256_init(&state); + sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE); + sha256_final(&state, (u8 *)mptcp_hashed_key); + + /* takes only first 160 bits */ + for (i = 0; i < 5; i++) + hash_out[i] = mptcp_hashed_key[i]; +} + +#ifdef CONFIG_MPTCP_HMAC_TEST +struct test_cast { + char *key; + char *msg; + char *result; +}; + +/* we can't reuse RFC 4231 test vectors, as we have constraint on the + * input and key size, and we truncate the output. + */ +static struct test_cast tests[] = { + { + .key = "0b0b0b0b0b0b0b0b", + .msg = "48692054", + .result = "8385e24fb4235ac37556b6b886db106284a1da67", + }, + { + .key = "aaaaaaaaaaaaaaaa", + .msg = "dddddddd", + .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492", + }, + { + .key = "0102030405060708", + .msg = "cdcdcdcd", + .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6", + }, +}; + +static int __init test_mptcp_crypto(void) +{ + char hmac[20], hmac_hex[41]; + u32 nonce1, nonce2; + u64 key1, key2; + int i, j; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + /* mptcp hmap will convert to be before computing the hmac */ + key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0])); + key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8])); + nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); + nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); + + mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + for (j = 0; j < 20; ++j) + sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); + hmac_hex[40] = 0; + + if (memcmp(hmac_hex, tests[i].result, 40)) + pr_err("test %d failed, got %s expected %s", i, + hmac_hex, tests[i].result); + else + pr_info("test %d [ ok ]", i); + } + return 0; } + +late_initcall(test_mptcp_crypto); +#endif diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1fa8496f3551..1aec742ca8e1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -9,6 +9,11 @@ #include #include "protocol.h" +static bool mptcp_cap_flag_sha256(u8 flags) +{ + return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; +} + void mptcp_parse_option(const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { @@ -29,7 +34,7 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, break; flags = *ptr++; - if (!((flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA1) || + if (!mptcp_cap_flag_sha256(flags) || (flags & MPTCP_CAP_EXTENSIBILITY)) break; @@ -399,7 +404,7 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | (MPTCPOPT_MP_CAPABLE << 12) | (MPTCP_SUPPORTED_VERSION << 8) | - MPTCP_CAP_HMAC_SHA1); + MPTCP_CAP_HMAC_SHA256); put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; if (OPTION_MPTCP_MPC_ACK & opts->suboptions) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a8fad7d78565..a355bb1cf31b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -43,7 +43,7 @@ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) -#define MPTCP_CAP_HMAC_SHA1 BIT(0) +#define MPTCP_CAP_HMAC_SHA256 BIT(0) #define MPTCP_CAP_FLAG_MASK (0x3F) /* MPTCP DSS flags */ @@ -216,7 +216,7 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) } void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - u32 *hash_out); + void *hash_out); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { -- cgit v1.2.3 From cc7972ea1932335e0a0ee00ac8a24b3e8304630d Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 21 Jan 2020 16:56:31 -0800 Subject: mptcp: parse and emit MP_CAPABLE option according to v1 spec This implements MP_CAPABLE options parsing and writing according to RFC 6824 bis / RFC 8684: MPTCP v1. Local key is sent on syn/ack, and both keys are sent on 3rd ack. MP_CAPABLE messages len are updated accordingly. We need the skbuff to correctly emit the above, so we push the skbuff struct as an argument all the way from tcp code to the relevant mptcp callbacks. When processing incoming MP_CAPABLE + data, build a full blown DSS-like map info, to simplify later processing. On child socket creation, we need to record the remote key, if available. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +- include/net/mptcp.h | 17 +++--- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/mptcp/options.c | 162 ++++++++++++++++++++++++++++++++++++++++---------- net/mptcp/protocol.h | 6 +- net/mptcp/subflow.c | 14 ++++- 7 files changed, 160 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0d00dad4b85d..4e2124607d32 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -94,7 +94,8 @@ struct mptcp_options_received { data_fin:1, use_ack:1, ack64:1, - __unused:3; + mpc_map:1, + __unused:2; }; #endif diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8619c1fca741..27627e2d1bc2 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -23,7 +23,8 @@ struct mptcp_ext { data_fin:1, use_ack:1, ack64:1, - __unused:3; + mpc_map:1, + __unused:2; /* one byte hole */ }; @@ -50,10 +51,10 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } -void mptcp_parse_option(const unsigned char *ptr, int opsize, - struct tcp_options_received *opt_rx); -bool mptcp_syn_options(struct sock *sk, unsigned int *size, - struct mptcp_out_options *opts); +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx); +bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts); void mptcp_rcv_synsent(struct sock *sk); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); @@ -121,12 +122,14 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return false; } -static inline void mptcp_parse_option(const unsigned char *ptr, int opsize, +static inline void mptcp_parse_option(const struct sk_buff *skb, + const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } -static inline bool mptcp_syn_options(struct sock *sk, unsigned int *size, +static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts) { return false; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28d31f2c1422..2f475b897c11 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3926,7 +3926,7 @@ void tcp_parse_options(const struct net *net, break; #endif case TCPOPT_MPTCP: - mptcp_parse_option(ptr, opsize, opt_rx); + mptcp_parse_option(skb, ptr, opsize, opt_rx); break; case TCPOPT_FASTOPEN: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5456076166da..fec4b3a4b22d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -685,7 +685,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (sk_is_mptcp(sk)) { unsigned int size; - if (mptcp_syn_options(sk, &size, &opts->mptcp)) { + if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { opts->options |= OPTION_MPTCP; remaining -= size; } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1aec742ca8e1..8f82ff9a5a8e 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -14,8 +14,8 @@ static bool mptcp_cap_flag_sha256(u8 flags) return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } -void mptcp_parse_option(const unsigned char *ptr, int opsize, - struct tcp_options_received *opt_rx) +void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, + int opsize, struct tcp_options_received *opt_rx) { struct mptcp_options_received *mp_opt = &opt_rx->mptcp; u8 subtype = *ptr >> 4; @@ -25,13 +25,29 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, switch (subtype) { case MPTCPOPT_MP_CAPABLE: - if (opsize != TCPOLEN_MPTCP_MPC_SYN && - opsize != TCPOLEN_MPTCP_MPC_ACK) + /* strict size checking */ + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + if (skb->len > tcp_hdr(skb)->doff << 2) + expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; + else + expected_opsize = TCPOLEN_MPTCP_MPC_ACK; + } else { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) + expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; + else + expected_opsize = TCPOLEN_MPTCP_MPC_SYN; + } + if (opsize != expected_opsize) break; + /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; - if (version != MPTCP_SUPPORTED_VERSION) + if (opsize != TCPOLEN_MPTCP_MPC_SYN) { + if (version != MPTCP_SUPPORTED_VERSION) + break; + } else if (version < MPTCP_SUPPORTED_VERSION) { break; + } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || @@ -55,23 +71,40 @@ void mptcp_parse_option(const unsigned char *ptr, int opsize, break; mp_opt->mp_capable = 1; - mp_opt->sndr_key = get_unaligned_be64(ptr); - ptr += 8; - - if (opsize == TCPOLEN_MPTCP_MPC_ACK) { + if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { + mp_opt->sndr_key = get_unaligned_be64(ptr); + ptr += 8; + } + if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; - pr_debug("MP_CAPABLE sndr=%llu, rcvr=%llu", - mp_opt->sndr_key, mp_opt->rcvr_key); - } else { - pr_debug("MP_CAPABLE sndr=%llu", mp_opt->sndr_key); } + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + /* Section 3.1.: + * "the data parameters in a MP_CAPABLE are semantically + * equivalent to those in a DSS option and can be used + * interchangeably." + */ + mp_opt->dss = 1; + mp_opt->use_map = 1; + mp_opt->mpc_map = 1; + mp_opt->data_len = get_unaligned_be16(ptr); + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + version, flags, opsize, mp_opt->sndr_key, + mp_opt->rcvr_key, mp_opt->data_len); break; case MPTCPOPT_DSS: pr_debug("DSS"); ptr++; + /* we must clear 'mpc_map' be able to detect MP_CAPABLE + * map vs DSS map in mptcp_incoming_options(), and reconstruct + * map info accordingly + */ + mp_opt->mpc_map = 0; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; @@ -176,18 +209,22 @@ void mptcp_get_options(const struct sk_buff *skb, if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) - mptcp_parse_option(ptr, opsize, opt_rx); + mptcp_parse_option(skb, ptr, opsize, opt_rx); ptr += opsize - 2; length -= opsize; } } } -bool mptcp_syn_options(struct sock *sk, unsigned int *size, - struct mptcp_out_options *opts) +bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, + unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + /* we will use snd_isn to detect first pkt [re]transmission + * in mptcp_established_options_mp() + */ + subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { pr_debug("local_key=%llu", subflow->local_key); opts->suboptions = OPTION_MPTCP_MPC_SYN; @@ -212,20 +249,52 @@ void mptcp_rcv_synsent(struct sock *sk) } } -static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size, +static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, + unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_ext *mpext; + unsigned int data_len; + + pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow, + subflow->fourth_ack, subflow->snd_isn, + skb ? TCP_SKB_CB(skb)->seq : 0, remaining); + + if (subflow->mp_capable && !subflow->fourth_ack && skb && + subflow->snd_isn == TCP_SKB_CB(skb)->seq) { + /* When skb is not available, we better over-estimate the + * emitted options len. A full DSS option is longer than + * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit + * that. + */ + mpext = mptcp_get_ext(skb); + data_len = mpext ? mpext->data_len : 0; - if (!subflow->fourth_ack) { + /* we will check ext_copy.data_len in mptcp_write_options() to + * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and + * TCPOLEN_MPTCP_MPC_ACK + */ + opts->ext_copy.data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; - *size = TCPOLEN_MPTCP_MPC_ACK; - subflow->fourth_ack = 1; - pr_debug("subflow=%p, local_key=%llu, remote_key=%llu", - subflow, subflow->local_key, subflow->remote_key); + + /* Section 3.1. + * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK + * packets that start the first subflow of an MPTCP connection, + * as well as the first packet that carries data + */ + if (data_len > 0) + *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); + else + *size = TCPOLEN_MPTCP_MPC_ACK; + + pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", + subflow, subflow->local_key, subflow->remote_key, + data_len); + return true; } return false; @@ -319,7 +388,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int opt_size = 0; bool ret = false; - if (mptcp_established_options_mp(sk, &opt_size, remaining, opts)) + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, opts)) @@ -371,11 +440,26 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, memset(mpext, 0, sizeof(*mpext)); if (mp_opt->use_map) { - mpext->data_seq = mp_opt->data_seq; - mpext->subflow_seq = mp_opt->subflow_seq; + if (mp_opt->mpc_map) { + struct mptcp_subflow_context *subflow = + mptcp_subflow_ctx(sk); + + /* this is an MP_CAPABLE carrying MPTCP data + * we know this map the first chunk of data + */ + mptcp_crypto_key_sha(subflow->remote_key, NULL, + &mpext->data_seq); + mpext->data_seq++; + mpext->subflow_seq = 1; + mpext->dsn64 = 1; + mpext->mpc_map = 1; + } else { + mpext->data_seq = mp_opt->data_seq; + mpext->subflow_seq = mp_opt->subflow_seq; + mpext->dsn64 = mp_opt->dsn64; + } mpext->data_len = mp_opt->data_len; mpext->use_map = 1; - mpext->dsn64 = mp_opt->dsn64; } if (mp_opt->use_ack) { @@ -389,8 +473,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) { - if ((OPTION_MPTCP_MPC_SYN | - OPTION_MPTCP_MPC_SYNACK | + if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { u8 len; @@ -398,6 +481,8 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) len = TCPOLEN_MPTCP_MPC_SYN; else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) len = TCPOLEN_MPTCP_MPC_SYNACK; + else if (opts->ext_copy.data_len) + len = TCPOLEN_MPTCP_MPC_ACK_DATA; else len = TCPOLEN_MPTCP_MPC_ACK; @@ -405,14 +490,27 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) (MPTCPOPT_MP_CAPABLE << 12) | (MPTCP_SUPPORTED_VERSION << 8) | MPTCP_CAP_HMAC_SHA256); + + if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & + opts->suboptions)) + goto mp_capable_done; + put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; - if (OPTION_MPTCP_MPC_ACK & opts->suboptions) { - put_unaligned_be64(opts->rcvr_key, ptr); - ptr += 2; - } + if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) + goto mp_capable_done; + + put_unaligned_be64(opts->rcvr_key, ptr); + ptr += 2; + if (!opts->ext_copy.data_len) + goto mp_capable_done; + + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + ptr += 1; } +mp_capable_done: if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a355bb1cf31b..36b90024d34d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -11,7 +11,7 @@ #include #include -#define MPTCP_SUPPORTED_VERSION 0 +#define MPTCP_SUPPORTED_VERSION 1 /* MPTCP option bits */ #define OPTION_MPTCP_MPC_SYN BIT(0) @@ -29,9 +29,10 @@ #define MPTCPOPT_MP_FASTCLOSE 7 /* MPTCP suboption lengths */ -#define TCPOLEN_MPTCP_MPC_SYN 12 +#define TCPOLEN_MPTCP_MPC_SYN 4 #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 +#define TCPOLEN_MPTCP_MPC_ACK_DATA 22 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 @@ -106,6 +107,7 @@ struct mptcp_subflow_context { u64 remote_key; u64 idsn; u64 map_seq; + u32 snd_isn; u32 token; u32 rel_write_seq; u32 map_subflow_seq; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9fb3eb87a20f..8892855f4f52 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -77,7 +77,6 @@ static void subflow_init_req(struct request_sock *req, if (err == 0) subflow_req->mp_capable = 1; - subflow_req->remote_key = rx_opt.mptcp.sndr_key; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; } } @@ -180,11 +179,22 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, bool *own_req) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); + struct mptcp_subflow_request_sock *subflow_req; + struct tcp_options_received opt_rx; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); - /* if the sk is MP_CAPABLE, we already received the client key */ + /* if the sk is MP_CAPABLE, we need to fetch the client key */ + subflow_req = mptcp_subflow_rsk(req); + if (subflow_req->mp_capable) { + opt_rx.mptcp.mp_capable = 0; + mptcp_get_options(skb, &opt_rx); + if (!opt_rx.mptcp.mp_capable) + subflow_req->mp_capable = 0; + else + subflow_req->remote_key = opt_rx.mptcp.sndr_key; + } child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); -- cgit v1.2.3 From d22f4988ffecbe284e4d00e897525adbd0edd801 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 21 Jan 2020 16:56:32 -0800 Subject: mptcp: process MP_CAPABLE data option This patch implements the handling of MP_CAPABLE + data option, as per RFC 6824 bis / RFC 8684: MPTCP v1. On the server side we can receive the remote key after that the connection is established. We need to explicitly track the 'missing remote key' status and avoid emitting a mptcp ack until we get such info. When a late/retransmitted/OoO pkt carrying MP_CAPABLE[+data] option is received, we have to propagate the mptcp seq number info to the msk socket. To avoid ABBA locking issue, explicitly check for that in recvmsg(), where we own msk and subflow sock locks. The above also means that an established mp_capable subflow - still waiting for the remote key - can be 'downgraded' to plain TCP. Such change could potentially block a reader waiting for new data forever - as they hook to msk, while later wake-up after the downgrade will be on subflow only. The above issue is not handled here, we likely have to get rid of msk->fallback to handle that cleanly. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/options.c | 56 +++++++++++++++++++++++++++++++++++++++++----------- net/mptcp/protocol.c | 16 ++++++++------- net/mptcp/protocol.h | 10 +++++++--- net/mptcp/subflow.c | 40 ++++++++++++++++++++++++++++++++----- 4 files changed, 95 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8f82ff9a5a8e..45acd877bef3 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -243,6 +243,7 @@ void mptcp_rcv_synsent(struct sock *sk) pr_debug("subflow=%p", subflow); if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { subflow->mp_capable = 1; + subflow->can_ack = 1; subflow->remote_key = tp->rx_opt.mptcp.sndr_key; } else { tcp_sk(sk)->is_mptcp = 0; @@ -332,6 +333,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_ext *mpext; struct mptcp_sock *msk; unsigned int ack_size; + bool ret = false; u8 tcp_fin; if (skb) { @@ -355,6 +357,14 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, if (skb && tcp_fin && subflow->conn->sk_state != TCP_ESTABLISHED) mptcp_write_data_fin(subflow, &opts->ext_copy); + ret = true; + } + + opts->ext_copy.use_ack = 0; + msk = mptcp_sk(subflow->conn); + if (!msk || !READ_ONCE(msk->can_ack)) { + *size = ALIGN(dss_size, 4); + return ret; } ack_size = TCPOLEN_MPTCP_DSS_ACK64; @@ -365,15 +375,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, dss_size += ack_size; - msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn); - if (msk) { - opts->ext_copy.data_ack = msk->ack_seq; - } else { - mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key, - NULL, &opts->ext_copy.data_ack); - opts->ext_copy.data_ack++; - } - + opts->ext_copy.data_ack = msk->ack_seq; opts->ext_copy.ack64 = 1; opts->ext_copy.use_ack = 1; @@ -422,13 +424,46 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return false; } +static bool check_fourth_ack(struct mptcp_subflow_context *subflow, + struct sk_buff *skb, + struct mptcp_options_received *mp_opt) +{ + /* here we can process OoO, in-window pkts, only in-sequence 4th ack + * are relevant + */ + if (likely(subflow->fourth_ack || + TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) + return true; + + if (mp_opt->use_ack) + subflow->fourth_ack = 1; + + if (subflow->can_ack) + return true; + + /* If the first established packet does not contain MP_CAPABLE + data + * then fallback to TCP + */ + if (!mp_opt->mp_capable) { + subflow->mp_capable = 0; + tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; + return false; + } + subflow->remote_key = mp_opt->sndr_key; + subflow->can_ack = 1; + return true; +} + void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct tcp_options_received *opt_rx) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received *mp_opt; struct mptcp_ext *mpext; mp_opt = &opt_rx->mptcp; + if (!check_fourth_ack(subflow, skb, mp_opt)) + return; if (!mp_opt->dss) return; @@ -441,9 +476,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, if (mp_opt->use_map) { if (mp_opt->mpc_map) { - struct mptcp_subflow_context *subflow = - mptcp_subflow_ctx(sk); - /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 45e482864a19..1b64dfaa5f63 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -30,7 +30,7 @@ */ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) { - if (!msk->subflow || mptcp_subflow_ctx(msk->subflow->sk)->fourth_ack) + if (!msk->subflow || READ_ONCE(msk->can_ack)) return NULL; return msk->subflow; @@ -651,17 +651,20 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, __mptcp_init_sock(new_mptcp_sock); msk = mptcp_sk(new_mptcp_sock); - msk->remote_key = subflow->remote_key; msk->local_key = subflow->local_key; msk->token = subflow->token; msk->subflow = NULL; mptcp_token_update_accept(newsk, new_mptcp_sock); - mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); msk->write_seq = subflow->idsn + 1; - ack_seq++; - msk->ack_seq = ack_seq; + if (subflow->can_ack) { + msk->can_ack = true; + msk->remote_key = subflow->remote_key; + mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); + ack_seq++; + msk->ack_seq = ack_seq; + } newsk = new_mptcp_sock; mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); @@ -678,8 +681,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, * the receive path and process the pending ones */ lock_sock(ssk); - subflow->map_seq = ack_seq; - subflow->map_subflow_seq = 1; subflow->rel_write_seq = 1; subflow->tcp_sock = ssk; subflow->conn = new_mptcp_sock; @@ -795,6 +796,7 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->token, subflow->token); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->can_ack, 1); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 36b90024d34d..10eaa7c7381b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -69,6 +69,7 @@ struct mptcp_sock { u64 ack_seq; u32 token; unsigned long flags; + bool can_ack; struct list_head conn_list; struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ @@ -84,9 +85,10 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) struct mptcp_subflow_request_sock { struct tcp_request_sock sk; - u8 mp_capable : 1, + u16 mp_capable : 1, mp_join : 1, - backup : 1; + backup : 1, + remote_key_valid : 1; u64 local_key; u64 remote_key; u64 idsn; @@ -118,8 +120,10 @@ struct mptcp_subflow_context { fourth_ack : 1, /* send initial DSS */ conn_finished : 1, map_valid : 1, + mpc_map : 1, data_avail : 1, - rx_eof : 1; + rx_eof : 1, + can_ack : 1; /* only after processing the remote a key */ struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8892855f4f52..8cfa1d29d59c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -61,6 +61,7 @@ static void subflow_init_req(struct request_sock *req, mptcp_get_options(skb, &rx_opt); subflow_req->mp_capable = 0; + subflow_req->remote_key_valid = 0; #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -185,17 +186,28 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); - /* if the sk is MP_CAPABLE, we need to fetch the client key */ + /* if the sk is MP_CAPABLE, we try to fetch the client key */ subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { + if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) { + /* here we can receive and accept an in-window, + * out-of-order pkt, which will not carry the MP_CAPABLE + * opt even on mptcp enabled paths + */ + goto create_child; + } + opt_rx.mptcp.mp_capable = 0; mptcp_get_options(skb, &opt_rx); - if (!opt_rx.mptcp.mp_capable) - subflow_req->mp_capable = 0; - else + if (opt_rx.mptcp.mp_capable) { subflow_req->remote_key = opt_rx.mptcp.sndr_key; + subflow_req->remote_key_valid = 1; + } else { + subflow_req->mp_capable = 0; + } } +create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); @@ -377,6 +389,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk) subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; + subflow->mpc_map = mpext->mpc_map; pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len); @@ -428,6 +441,19 @@ static bool subflow_check_data_avail(struct sock *ssk) if (WARN_ON_ONCE(!skb)) return false; + /* if msk lacks the remote key, this subflow must provide an + * MP_CAPABLE-based mapping + */ + if (unlikely(!READ_ONCE(msk->can_ack))) { + if (!subflow->mpc_map) { + ssk->sk_err = EBADMSG; + goto fatal; + } + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->ack_seq, subflow->map_seq); + WRITE_ONCE(msk->can_ack, true); + } + old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, @@ -752,13 +778,17 @@ static void subflow_ulp_clone(const struct request_sock *req, return; } + /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully + * established only after we receive the remote key + */ new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_write_space = old_ctx->tcp_write_space; new_ctx->mp_capable = 1; - new_ctx->fourth_ack = 1; + new_ctx->fourth_ack = subflow_req->remote_key_valid; + new_ctx->can_ack = subflow_req->remote_key_valid; new_ctx->remote_key = subflow_req->remote_key; new_ctx->local_key = subflow_req->local_key; new_ctx->token = subflow_req->token; -- cgit v1.2.3 From 8ab183deb26a3b79f8021afa9e83cc1bbd812031 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 21 Jan 2020 16:56:33 -0800 Subject: mptcp: cope with later TCP fallback With MPTCP v1, passive connections can fallback to TCP after the subflow becomes established: syn + MP_CAPABLE -> <- syn, ack + MP_CAPABLE ack, seq = 3 -> // OoO packet is accepted because in-sequence // passive socket is created, is in ESTABLISHED // status and tentatively as MP_CAPABLE ack, seq = 2 -> // no MP_CAPABLE opt, subflow should fallback to TCP We can't use the 'subflow' socket fallback, as we don't have it available for passive connection. Instead, when the fallback is detected, replace the mptcp socket with the underlying TCP subflow. Beyond covering the above scenario, it makes a TCP fallback socket as efficient as plain TCP ones. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 119 +++++++++++++++++++++++++++++++++++++++++++-------- net/mptcp/protocol.h | 1 + 2 files changed, 103 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1b64dfaa5f63..ee916b3686fe 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -24,6 +24,58 @@ #define MPTCP_SAME_STATE TCP_MAX_STATES +static void __mptcp_close(struct sock *sk, long timeout); + +static const struct proto_ops * tcp_proto_ops(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return &inet6_stream_ops; +#endif + return &inet_stream_ops; +} + +/* MP_CAPABLE handshake failed, convert msk to plain tcp, replacing + * socket->sk and stream ops and destroying msk + * return the msk socket, as we can't access msk anymore after this function + * completes + * Called with msk lock held, releases such lock before returning + */ +static struct socket *__mptcp_fallback_to_tcp(struct mptcp_sock *msk, + struct sock *ssk) +{ + struct mptcp_subflow_context *subflow; + struct socket *sock; + struct sock *sk; + + sk = (struct sock *)msk; + sock = sk->sk_socket; + subflow = mptcp_subflow_ctx(ssk); + + /* detach the msk socket */ + list_del_init(&subflow->node); + sock_orphan(sk); + sock->sk = NULL; + + /* socket is now TCP */ + lock_sock(ssk); + sock_graft(ssk, sock); + if (subflow->conn) { + /* We can't release the ULP data on a live socket, + * restore the tcp callback + */ + mptcp_subflow_tcp_fallback(ssk, subflow); + sock_put(subflow->conn); + subflow->conn = NULL; + } + release_sock(ssk); + sock->ops = tcp_proto_ops(ssk); + + /* destroy the left-over msk sock */ + __mptcp_close(sk, 0); + return sock; +} + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. @@ -36,25 +88,37 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } -/* if msk has a single subflow, and the mp_capable handshake is failed, - * return it. +static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) +{ + return msk->first && !sk_is_mptcp(msk->first); +} + +/* if the mp_capable handshake has failed, it fallbacks msk to plain TCP, + * releases the socket lock and returns a reference to the now TCP socket. * Otherwise returns NULL */ -static struct socket *__mptcp_tcp_fallback(const struct mptcp_sock *msk) +static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) { - struct socket *ssock = __mptcp_nmpc_socket(msk); - sock_owned_by_me((const struct sock *)msk); - if (!ssock || sk_is_mptcp(ssock->sk)) + if (likely(!__mptcp_needs_tcp_fallback(msk))) return NULL; - return ssock; + if (msk->subflow) { + /* the first subflow is an active connection, discart the + * paired socket + */ + msk->subflow->sk = NULL; + sock_release(msk->subflow); + msk->subflow = NULL; + } + + return __mptcp_fallback_to_tcp(msk, msk->first); } static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) { - return ((struct sock *)msk)->sk_state == TCP_CLOSE; + return !msk->first; } static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) @@ -75,6 +139,7 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) if (err) return ERR_PTR(err); + msk->first = ssock->sk; msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); @@ -154,6 +219,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, ret = sk_stream_wait_memory(ssk, timeo); if (ret) return ret; + if (unlikely(__mptcp_needs_tcp_fallback(msk))) + return 0; } /* compute copy limit */ @@ -265,11 +332,11 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); ssock = __mptcp_tcp_fallback(msk); - if (ssock) { + if (unlikely(ssock)) { +fallback: pr_debug("fallback passthrough"); ret = sock_sendmsg(ssock, msg); - release_sock(sk); - return ret; + return ret >= 0 ? ret + copied : (copied ? copied : ret); } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -288,6 +355,11 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) &size_goal); if (ret < 0) break; + if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { + release_sock(ssk); + ssock = __mptcp_tcp_fallback(msk); + goto fallback; + } copied += ret; } @@ -368,11 +440,11 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, lock_sock(sk); ssock = __mptcp_tcp_fallback(msk); - if (ssock) { + if (unlikely(ssock)) { +fallback: pr_debug("fallback-read subflow=%p", mptcp_subflow_ctx(ssock->sk)); copied = sock_recvmsg(ssock, msg, flags); - release_sock(sk); return copied; } @@ -477,6 +549,8 @@ wait_for_data: pr_debug("block timeout %ld", timeo); wait_data = true; mptcp_wait_data(sk, &timeo); + if (unlikely(__mptcp_tcp_fallback(msk))) + goto fallback; } if (more_data_avail) { @@ -529,6 +603,8 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); __set_bit(MPTCP_SEND_SPACE, &msk->flags); + msk->first = NULL; + return 0; } @@ -563,7 +639,8 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how) release_sock(ssk); } -static void mptcp_close(struct sock *sk, long timeout) +/* Called with msk lock held, releases such lock before returning */ +static void __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); @@ -571,8 +648,6 @@ static void mptcp_close(struct sock *sk, long timeout) mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); - lock_sock(sk); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -585,6 +660,12 @@ static void mptcp_close(struct sock *sk, long timeout) sk_common_release(sk); } +static void mptcp_close(struct sock *sk, long timeout) +{ + lock_sock(sk); + __mptcp_close(sk, timeout); +} + static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -654,6 +735,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, msk->local_key = subflow->local_key; msk->token = subflow->token; msk->subflow = NULL; + msk->first = newsk; mptcp_token_update_accept(newsk, new_mptcp_sock); @@ -1007,8 +1089,8 @@ unlock_fail: static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { - const struct mptcp_sock *msk; struct sock *sk = sock->sk; + struct mptcp_sock *msk; struct socket *ssock; __poll_t mask = 0; @@ -1024,6 +1106,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, release_sock(sk); sock_poll_wait(file, sock, wait); lock_sock(sk); + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) + return ssock->ops->poll(file, ssock, NULL); if (test_bit(MPTCP_DATA_READY, &msk->flags)) mask = EPOLLIN | EPOLLRDNORM; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 10eaa7c7381b..8a99a2930284 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -73,6 +73,7 @@ struct mptcp_sock { struct list_head conn_list; struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ + struct sock *first; }; #define mptcp_for_each_subflow(__msk, __subflow) \ -- cgit v1.2.3 From e42f1ac626e7f799717d006e0f8393b6d6f9fc8c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Jan 2020 16:04:02 -0800 Subject: mptcp: do not inherit inet proto ops We need to initialise the struct ourselves, else we expose tcp-specific callbacks such as tcp_splice_read which will then trigger splat because the socket is an mptcp one: BUG: KASAN: slab-out-of-bounds in tcp_mstamp_refresh+0x80/0xa0 net/ipv4/tcp_output.c:57 Write of size 8 at addr ffff888116aa21d0 by task syz-executor.0/5478 CPU: 1 PID: 5478 Comm: syz-executor.0 Not tainted 5.5.0-rc6 #3 Call Trace: tcp_mstamp_refresh+0x80/0xa0 net/ipv4/tcp_output.c:57 tcp_rcv_space_adjust+0x72/0x7f0 net/ipv4/tcp_input.c:612 tcp_read_sock+0x622/0x990 net/ipv4/tcp.c:1674 tcp_splice_read+0x20b/0xb40 net/ipv4/tcp.c:791 do_splice+0x1259/0x1560 fs/splice.c:1205 To prevent build error with ipv6, add the recv/sendmsg function declaration to ipv6.h. The functions are already accessible "thanks" to retpoline related work, but they are currently only made visible by socket.c specific INDIRECT_CALLABLE macros. Reported-by: Christoph Paasch Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 +++ net/mptcp/protocol.c | 70 ++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4e95f6df508c..cec1a54401f2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1113,6 +1113,9 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); +int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags); /* * reassembly.c diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ee916b3686fe..41d49126d29a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1163,7 +1163,31 @@ out_unlock: return ret; } -static struct proto_ops mptcp_stream_ops; +static const struct proto_ops mptcp_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v4_getname, + .poll = mptcp_poll, + .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, @@ -1176,14 +1200,6 @@ static struct inet_protosw mptcp_protosw = { void mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; - mptcp_stream_ops = inet_stream_ops; - mptcp_stream_ops.bind = mptcp_bind; - mptcp_stream_ops.connect = mptcp_stream_connect; - mptcp_stream_ops.poll = mptcp_poll; - mptcp_stream_ops.accept = mptcp_stream_accept; - mptcp_stream_ops.getname = mptcp_v4_getname; - mptcp_stream_ops.listen = mptcp_listen; - mptcp_stream_ops.shutdown = mptcp_shutdown; mptcp_subflow_init(); @@ -1194,7 +1210,32 @@ void mptcp_proto_init(void) } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static struct proto_ops mptcp_v6_stream_ops; +static const struct proto_ops mptcp_v6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = mptcp_bind, + .connect = mptcp_stream_connect, + .socketpair = sock_no_socketpair, + .accept = mptcp_stream_accept, + .getname = mptcp_v6_getname, + .poll = mptcp_poll, + .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, + .listen = mptcp_listen, + .shutdown = mptcp_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet6_sendmsg, + .recvmsg = inet6_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + static struct proto mptcp_v6_prot; static void mptcp_v6_destroy(struct sock *sk) @@ -1226,15 +1267,6 @@ int mptcp_proto_v6_init(void) if (err) return err; - mptcp_v6_stream_ops = inet6_stream_ops; - mptcp_v6_stream_ops.bind = mptcp_bind; - mptcp_v6_stream_ops.connect = mptcp_stream_connect; - mptcp_v6_stream_ops.poll = mptcp_poll; - mptcp_v6_stream_ops.accept = mptcp_stream_accept; - mptcp_v6_stream_ops.getname = mptcp_v6_getname; - mptcp_v6_stream_ops.listen = mptcp_listen; - mptcp_v6_stream_ops.shutdown = mptcp_shutdown; - err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); -- cgit v1.2.3 From edc7e4898d5fdb164de1ab9d3a0c30ef9888e4f1 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 24 Jan 2020 16:04:03 -0800 Subject: mptcp: Fix code formatting checkpatch.pl had a few complaints in the last set of MPTCP patches: ERROR: code indent should use tabs where possible +^I subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);$ CHECK: Comparison to NULL could be written "!new_ctx" + if (new_ctx == NULL) { ERROR: "foo * bar" should be "foo *bar" +static const struct proto_ops * tcp_proto_ops(struct sock *sk) Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 +- net/mptcp/subflow.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 41d49126d29a..39fdca79ce90 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -26,7 +26,7 @@ static void __mptcp_close(struct sock *sk, long timeout); -static const struct proto_ops * tcp_proto_ops(struct sock *sk) +static const struct proto_ops *tcp_proto_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8cfa1d29d59c..1662e1178949 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -592,7 +592,7 @@ void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped) target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", - subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); + subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) return; @@ -773,7 +773,7 @@ static void subflow_ulp_clone(const struct request_sock *req, } new_ctx = subflow_create_ctx(newsk, priority); - if (new_ctx == NULL) { + if (!new_ctx) { subflow_ulp_fallback(newsk, old_ctx); return; } -- cgit v1.2.3 From 79ac522402fca97deef3e07be9600bfab20d47a8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Jan 2020 16:20:02 +0200 Subject: net: atm: use %*ph to print small buffer Use %*ph format to print small buffer as hex string. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- net/atm/atm_sysfs.c | 22 ++++++---------- net/atm/lec.c | 76 ++++++++++++++--------------------------------------- 2 files changed, 28 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 39b94ca5f65d..aa1b57161f3b 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -33,23 +33,17 @@ static ssize_t show_atmaddress(struct device *cdev, unsigned long flags; struct atm_dev *adev = to_atm_dev(cdev); struct atm_dev_addr *aaddr; - int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin; - int i, j, count = 0; + int count = 0; spin_lock_irqsave(&adev->lock, flags); list_for_each_entry(aaddr, &adev->local, entry) { - for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) { - if (j == *fmt) { - count += scnprintf(buf + count, - PAGE_SIZE - count, "."); - ++fmt; - j = 0; - } - count += scnprintf(buf + count, - PAGE_SIZE - count, "%02x", - aaddr->addr.sas_addr.prv[i]); - } - count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); + count += scnprintf(buf + count, PAGE_SIZE - count, + "%1phN.%2phN.%10phN.%6phN.%1phN\n", + &aaddr->addr.sas_addr.prv[0], + &aaddr->addr.sas_addr.prv[1], + &aaddr->addr.sas_addr.prv[3], + &aaddr->addr.sas_addr.prv[13], + &aaddr->addr.sas_addr.prv[19]); } spin_unlock_irqrestore(&adev->lock, flags); diff --git a/net/atm/lec.c b/net/atm/lec.c index b57368e70aab..25fa3a7b72bd 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -799,14 +799,9 @@ static const char *lec_arp_get_status_string(unsigned char status) static void lec_info(struct seq_file *seq, struct lec_arp_table *entry) { - int i; - - for (i = 0; i < ETH_ALEN; i++) - seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff); - seq_printf(seq, " "); - for (i = 0; i < ATM_ESA_LEN; i++) - seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff); - seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status), + seq_printf(seq, "%pM ", entry->mac_addr); + seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr); + seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status), entry->flags & 0xffff); if (entry->vcc) seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci); @@ -1354,7 +1349,7 @@ static void dump_arp_table(struct lec_priv *priv) { struct lec_arp_table *rulla; char buf[256]; - int i, j, offset; + int i, offset; pr_info("Dump %p:\n", priv); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { @@ -1362,14 +1357,10 @@ static void dump_arp_table(struct lec_priv *priv) &priv->lec_arp_tables[i], next) { offset = 0; offset += sprintf(buf, "%d: %p\n", i, rulla); - offset += sprintf(buf + offset, "Mac: %pM", + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, - "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1392,12 +1383,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("No forward\n"); hlist_for_each_entry(rulla, &priv->lec_no_forward, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1417,12 +1405,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("Empty ones\n"); hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1442,12 +1427,9 @@ static void dump_arp_table(struct lec_priv *priv) pr_info("Multicast Forward VCCs\n"); hlist_for_each_entry(rulla, &priv->mcast_fwds, next) { offset = 0; - offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); - offset += sprintf(buf + offset, " Atm:"); - for (j = 0; j < ATM_ESA_LEN; j++) { - offset += sprintf(buf + offset, "%2.2x ", - rulla->atm_addr[j] & 0xff); - } + offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); + offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, + rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, @@ -1973,17 +1955,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, * Vcc which we don't want to make default vcc, * attach it anyway. */ - pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", - ioc_data->atm_addr[0], ioc_data->atm_addr[1], - ioc_data->atm_addr[2], ioc_data->atm_addr[3], - ioc_data->atm_addr[4], ioc_data->atm_addr[5], - ioc_data->atm_addr[6], ioc_data->atm_addr[7], - ioc_data->atm_addr[8], ioc_data->atm_addr[9], - ioc_data->atm_addr[10], ioc_data->atm_addr[11], - ioc_data->atm_addr[12], ioc_data->atm_addr[13], - ioc_data->atm_addr[14], ioc_data->atm_addr[15], - ioc_data->atm_addr[16], ioc_data->atm_addr[17], - ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n", + ATM_ESA_LEN, ioc_data->atm_addr); entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; @@ -1999,17 +1972,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, dump_arp_table(priv); goto out; } - pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", - ioc_data->atm_addr[0], ioc_data->atm_addr[1], - ioc_data->atm_addr[2], ioc_data->atm_addr[3], - ioc_data->atm_addr[4], ioc_data->atm_addr[5], - ioc_data->atm_addr[6], ioc_data->atm_addr[7], - ioc_data->atm_addr[8], ioc_data->atm_addr[9], - ioc_data->atm_addr[10], ioc_data->atm_addr[11], - ioc_data->atm_addr[12], ioc_data->atm_addr[13], - ioc_data->atm_addr[14], ioc_data->atm_addr[15], - ioc_data->atm_addr[16], ioc_data->atm_addr[17], - ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n", + ATM_ESA_LEN, ioc_data->atm_addr); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { -- cgit v1.2.3 From 6ec8b6cd79a4360e375da99d848d63f8d4fb08b3 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 23 Jan 2020 19:57:13 +0200 Subject: devlink: Add health recover notifications on devlink flows Devlink health recover notifications were added only on driver direct updates of health_state through devlink_health_reporter_state_update(). Add notifications on updates of health_state by devlink flows of report and recover. Moved functions devlink_nl_health_reporter_fill() and devlink_recover_notify() to avoid forward declaration. Fixes: 97ff3bd37fac ("devlink: add devink notification when reporter update health state") Signed-off-by: Moshe Shemesh Signed-off-by: David S. Miller --- net/core/devlink.c | 176 +++++++++++++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 87 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 64367eeb21e6..ca1df0ec3c97 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4843,6 +4843,93 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); +static int +devlink_nl_health_reporter_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_health_reporter *reporter, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + struct nlattr *reporter_attr; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + reporter_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_HEALTH_REPORTER); + if (!reporter_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, + reporter->ops->name)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, + reporter->health_state)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, + reporter->error_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, + reporter->recovery_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + reporter->graceful_period, + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->ops->recover && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, + reporter->auto_recover)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, + jiffies_to_msecs(reporter->dump_ts), + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, + reporter->dump_real_ts, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + + nla_nest_end(msg, reporter_attr); + genlmsg_end(msg, hdr); + return 0; + +reporter_nest_cancel: + nla_nest_end(msg, reporter_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_recover_notify(struct devlink_health_reporter *reporter, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_health_reporter_fill(msg, reporter->devlink, + reporter, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(reporter->devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { @@ -4869,6 +4956,7 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter, devlink_health_reporter_recovery_done(reporter); reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); return 0; } @@ -4935,6 +5023,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->error_count++; prev_health_state = reporter->health_state; reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; + devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); /* abort if the previous error wasn't recovered */ if (reporter->auto_recover && @@ -5017,93 +5106,6 @@ devlink_health_reporter_put(struct devlink_health_reporter *reporter) refcount_dec(&reporter->refcount); } -static int -devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_health_reporter *reporter, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct nlattr *reporter_attr; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - reporter_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_HEALTH_REPORTER); - if (!reporter_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, - reporter->ops->name)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, - reporter->health_state)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, - reporter->error_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, - reporter->recovery_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->ops->recover && - nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, - reporter->auto_recover)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, - reporter->dump_real_ts, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - - nla_nest_end(msg, reporter_attr); - genlmsg_end(msg, hdr); - return 0; - -reporter_nest_cancel: - nla_nest_end(msg, reporter_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_recover_notify(struct devlink_health_reporter *reporter, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_health_reporter_fill(msg, reporter->devlink, - reporter, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, - devlink_net(reporter->devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state) -- cgit v1.2.3 From c2070152747ebc294f018e4b28bf2d9f719a6b36 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 24 Jan 2020 15:23:05 +0200 Subject: net: sched: sch_tbf: Don't overwrite backlog before dumping In 2011, in commit b0460e4484f9 ("sch_tbf: report backlog information"), TBF started copying backlog depth from the child Qdisc before dumping, with the motivation that the backlog was otherwise not visible in "tc -s qdisc show". Later, in 2016, in commit 8d5958f424b6 ("sch_tbf: update backlog as well"), TBF got a full-blown backlog tracking. However it kept copying the child's backlog over before dumping. That line is now unnecessary, so remove it. As shown in the following example, backlog is still reported correctly: # tc -s qdisc show dev veth0 invisible qdisc tbf 1: root refcnt 2 rate 1Mbit burst 128Kb lat 82.8s Sent 505475370 bytes 406985 pkt (dropped 0, overlimits 812544 requeues 0) backlog 81972b 66p requeues 0 qdisc bfifo 0: parent 1:1 limit 10Mb Sent 505475370 bytes 406985 pkt (dropped 0, overlimits 0 requeues 0) backlog 81972b 66p requeues 0 Signed-off-by: Petr Machata Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- net/sched/sch_tbf.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 2cd94973795c..7ae317958090 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -441,7 +441,6 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest; struct tc_tbf_qopt opt; - sch->qstats.backlog = q->qdisc->qstats.backlog; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; -- cgit v1.2.3 From ef6aadcc76c97e25f62adc4e9d19684d3e5d0b87 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 24 Jan 2020 15:23:06 +0200 Subject: net: sched: Make TBF Qdisc offloadable Invoke ndo_setup_tc as appropriate to signal init / replacement, destroying and dumping of TBF Qdisc. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 22 +++++++++++++++++++ net/sched/sch_tbf.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ec3537fbdb1..11bdf6cb30bd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -850,6 +850,7 @@ enum tc_setup_type { TC_SETUP_QDISC_TAPRIO, TC_SETUP_FT, TC_SETUP_QDISC_ETS, + TC_SETUP_QDISC_TBF, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 47b115e2012a..ce036492986a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -854,4 +854,26 @@ struct tc_ets_qopt_offload { }; }; +enum tc_tbf_command { + TC_TBF_REPLACE, + TC_TBF_DESTROY, + TC_TBF_STATS, +}; + +struct tc_tbf_qopt_offload_replace_params { + struct psched_ratecfg rate; + u32 max_size; + struct gnet_stats_queue *qstats; +}; + +struct tc_tbf_qopt_offload { + enum tc_tbf_command command; + u32 handle; + u32 parent; + union { + struct tc_tbf_qopt_offload_replace_params replace_params; + struct tc_qopt_offload_stats stats; + }; +}; + #endif diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 7ae317958090..78e79029dc63 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -137,6 +138,52 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r, return len; } +static void tbf_offload_change(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.replace_params.rate = q->rate; + qopt.replace_params.max_size = q->max_size; + qopt.replace_params.qstats = &sch->qstats; + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static void tbf_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_tbf_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_TBF_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); +} + +static int tbf_offload_dump(struct Qdisc *sch) +{ + struct tc_tbf_qopt_offload qopt; + + qopt.command = TC_TBF_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); +} + /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ @@ -407,6 +454,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_unlock(sch); err = 0; + + tbf_offload_change(sch); done: return err; } @@ -432,6 +481,7 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); + tbf_offload_destroy(sch); qdisc_put(q->qdisc); } @@ -440,6 +490,11 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *nest; struct tc_tbf_qopt opt; + int err; + + err = tbf_offload_dump(sch); + if (err) + return err; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) -- cgit v1.2.3 From cc974003615afa044fa62c7520ae690091fa684a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 25 Jan 2020 09:23:47 +0100 Subject: Bluetooth: Add missing checks for HCI_ISODATA_PKT packet type The checks for HCI_ISODATA_PKT packet type are required in a few additional locations to allow sending/receiving of this new packet type. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 4 +++- net/bluetooth/hci_sock.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1ca7508b6ca7..cbbc34a006d1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3565,7 +3565,8 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { kfree_skb(skb); return -EINVAL; } @@ -4543,6 +4544,7 @@ static void hci_rx_work(struct work_struct *work) switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: + case HCI_ISODATA_PKT: kfree_skb(skb); continue; } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 3ae508674ef7..5756b124c2cb 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -211,7 +211,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; if (is_filtered_packet(sk, skb)) continue; @@ -220,7 +221,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) continue; if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) continue; } else { /* Don't send frame to other channel types */ @@ -1768,7 +1770,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, */ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } @@ -1812,7 +1815,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, } if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && - hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { err = -EINVAL; goto drop; } -- cgit v1.2.3 From 18f81241b74fb49d576c83fbbab9a0b6e3bb20d4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sat, 25 Jan 2020 09:19:51 +0100 Subject: Bluetooth: Move {min,max}_key_size debugfs into hci_debugfs_create_le The debugfs entries for {min,max}_key_size are created during SMP registration and thus it might lead to multiple attempts to create the same entries. Avoid this by moving them to the LE controller init section. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_debugfs.c | 61 +++++++++++++++++++++++++++++ net/bluetooth/smp.c | 93 --------------------------------------------- 2 files changed, 61 insertions(+), 93 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 1c8100bc4e04..6b1314c738b8 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -26,6 +26,7 @@ #include #include +#include "smp.h" #include "hci_debugfs.h" #define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \ @@ -989,6 +990,62 @@ static int adv_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, adv_max_interval_set, "%llu\n"); +static int min_key_size_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val > hdev->le_max_key_size || val < SMP_MIN_ENC_KEY_SIZE) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->le_min_key_size = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int min_key_size_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->le_min_key_size; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(min_key_size_fops, min_key_size_get, + min_key_size_set, "%llu\n"); + +static int max_key_size_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val > SMP_MAX_ENC_KEY_SIZE || val < hdev->le_min_key_size) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->le_max_key_size = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int max_key_size_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->le_max_key_size; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(max_key_size_fops, max_key_size_get, + max_key_size_set, "%llu\n"); + static int auth_payload_timeout_set(void *data, u64 val) { struct hci_dev *hdev = data; @@ -1071,6 +1128,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &adv_max_interval_fops); debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs, &hdev->discov_interleaved_timeout); + debugfs_create_file("min_key_size", 0644, hdev->debugfs, hdev, + &min_key_size_fops); + debugfs_create_file("max_key_size", 0644, hdev->debugfs, hdev, + &max_key_size_fops); debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, &auth_payload_timeout_fops); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4ece170c518e..204f14f8b507 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3373,94 +3373,6 @@ static const struct file_operations force_bredr_smp_fops = { .llseek = default_llseek, }; -static ssize_t le_min_key_size_read(struct file *file, - char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[4]; - - snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size); - - return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); -} - -static ssize_t le_min_key_size_write(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf) - 1)); - u8 key_size; - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - - sscanf(buf, "%hhu", &key_size); - - if (key_size > hdev->le_max_key_size || - key_size < SMP_MIN_ENC_KEY_SIZE) - return -EINVAL; - - hdev->le_min_key_size = key_size; - - return count; -} - -static const struct file_operations le_min_key_size_fops = { - .open = simple_open, - .read = le_min_key_size_read, - .write = le_min_key_size_write, - .llseek = default_llseek, -}; - -static ssize_t le_max_key_size_read(struct file *file, - char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[4]; - - snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size); - - return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); -} - -static ssize_t le_max_key_size_write(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf) - 1)); - u8 key_size; - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - - sscanf(buf, "%hhu", &key_size); - - if (key_size > SMP_MAX_ENC_KEY_SIZE || - key_size < hdev->le_min_key_size) - return -EINVAL; - - hdev->le_max_key_size = key_size; - - return count; -} - -static const struct file_operations le_max_key_size_fops = { - .open = simple_open, - .read = le_max_key_size_read, - .write = le_max_key_size_write, - .llseek = default_llseek, -}; - int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; @@ -3485,11 +3397,6 @@ int smp_register(struct hci_dev *hdev) hdev->smp_data = chan; - debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev, - &le_min_key_size_fops); - debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev, - &le_max_key_size_fops); - /* If the controller does not support BR/EDR Secure Connections * feature, then the BR/EDR SMP channel shall not be present. * -- cgit v1.2.3 From 11eb85ec42dc8c7a7ec519b90ccf2eeae9409de8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 Jan 2020 20:49:04 +0300 Subject: Bluetooth: Fix race condition in hci_release_sock() Syzbot managed to trigger a use after free "KASAN: use-after-free Write in hci_sock_bind". I have reviewed the code manually and one possibly cause I have found is that we are not holding lock_sock(sk) when we do the hci_dev_put(hdev) in hci_sock_release(). My theory is that the bind and the release are racing against each other which results in this use after free. Reported-by: syzbot+eba992608adf3d796bcc@syzkaller.appspotmail.com Signed-off-by: Dan Carpenter Signed-off-by: Johan Hedberg --- net/bluetooth/hci_sock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 5756b124c2cb..9c4a093f8960 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -839,6 +839,8 @@ static int hci_sock_release(struct socket *sock) if (!sk) return 0; + lock_sock(sk); + switch (hci_pi(sk)->channel) { case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); @@ -886,6 +888,7 @@ static int hci_sock_release(struct socket *sock) skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); + release_sock(sk); sock_put(sk); return 0; } -- cgit v1.2.3 From 32efcc06d2a15fa87585614d12d6c2308cc2d3f3 Mon Sep 17 00:00:00 2001 From: Abdul Kabbani Date: Fri, 24 Jan 2020 16:34:02 -0500 Subject: tcp: export count for rehash attempts Using IPv6 flow-label to swiftly route around avoid congested or disconnected network path can greatly improve TCP reliability. This patch adds SNMP counters and a OPT_STATS counter to track both host-level and connection-level statistics. Network administrators can use these counters to evaluate the impact of this new ability better. Export count for rehash attempts to 1) two SNMP counters: TcpTimeoutRehash (rehash due to timeouts), and TcpDuplicateDataRehash (rehash due to receiving duplicate packets) 2) Timestamping API SOF_TIMESTAMPING_OPT_STATS. Signed-off-by: Abdul Kabbani Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Kevin(Yudong) Yang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 ++ include/uapi/linux/snmp.h | 2 ++ include/uapi/linux/tcp.h | 1 + net/ipv4/proc.c | 2 ++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_timer.c | 6 ++++++ 7 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e2124607d32..1cf73e6f85ca 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -386,6 +386,8 @@ struct tcp_sock { #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif + u16 timeout_rehash; /* Timeout-triggered rehash attempts */ + u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* Receiver side RTT estimation */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 7eee233e78d2..7d91f4debc48 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -285,6 +285,8 @@ enum LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */ LINUX_MIB_TCPFASTOPENPASSIVEALTKEY, /* TCPFastOpenPassiveAltKey */ + LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ + LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index d87184e673ca..fd9eb8f6bcae 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -311,6 +311,7 @@ enum { TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ TCP_NLA_REORD_SEEN, /* reordering events seen */ TCP_NLA_SRTT, /* smoothed RTT in usecs */ + TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index cc90243ccf76..2580303249e2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -289,6 +289,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), + SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), + SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2857c853e2fd..484485ae74c2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3337,6 +3337,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ + nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ 0; } @@ -3391,6 +3392,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); + nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4915de65d278..e8b840a4767e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4271,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { sk_rethink_txhash(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1097b438befe..c3f26dcd6704 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -223,6 +223,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; @@ -234,6 +237,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = net->ipv4.sysctl_tcp_retries2; -- cgit v1.2.3 From 20a1452c35425b2cef76f21f8395ef069dfddfa9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Jan 2020 00:17:51 +0100 Subject: netfilter: nf_tables: add nft_setelem_parse_key() Add helper function to parse the set element key netlink attribute. v4: No changes v3: New patch [sbrivio: refactor error paths and labels; use NFT_DATA_VALUE_MAXLEN instead of sizeof(*key) in helper, value can be longer than that; rebase] Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 91 +++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7e63b481cc86..58e3b285a3d1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4524,11 +4524,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; } +static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data *key, struct nlattr *attr) +{ + struct nft_data_desc desc; + int err; + + err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr); + if (err < 0) + return err; + + if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { + nft_data_release(key, desc.type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; - struct nft_data_desc desc; struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; @@ -4547,17 +4564,11 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) return err; - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) { - nft_data_release(&elem.key.val, desc.type); - return err; - } - priv = set->ops->get(ctx->net, set, &elem, flags); if (IS_ERR(priv)) return PTR_ERR(priv); @@ -4756,13 +4767,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; u8 genmask = nft_genmask_next(ctx->net); - struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; struct nft_userdata *udata; + struct nft_data_desc desc; struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; @@ -4828,15 +4839,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; - err = -EINVAL; - if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) - goto err2; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -4859,13 +4867,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &d2, + err = nft_data_init(ctx, &data, sizeof(data), &desc, nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err2; err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen) + if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) goto err3; dreg = nft_type_to_reg(set->dtype); @@ -4882,18 +4890,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_validate_register_store(&bind_ctx, dreg, &data, - d2.type, d2.len); + desc.type, desc.len); if (err < 0) goto err3; - if (d2.type == NFT_DATA_VERDICT && + if (desc.type == NFT_DATA_VERDICT && (data.verdict.code == NFT_GOTO || data.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len); } /* The full maximum length of userdata can exceed the maximum @@ -4976,9 +4984,9 @@ err4: kfree(elem.priv); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, d2.type); + nft_data_release(&data, desc.type); err2: - nft_data_release(&elem.key.val, d1.type); + nft_data_release(&elem.key.val, NFT_DATA_VALUE); err1: return err; } @@ -5074,7 +5082,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; - struct nft_data_desc desc; struct nft_set_elem elem; struct nft_set_ext *ext; struct nft_trans *trans; @@ -5085,11 +5092,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy, NULL); if (err < 0) - goto err1; + return err; - err = -EINVAL; if (nla[NFTA_SET_ELEM_KEY] == NULL) - goto err1; + return -EINVAL; nft_set_ext_prepare(&tmpl); @@ -5099,37 +5105,31 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, - nla[NFTA_SET_ELEM_KEY]); + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; - - err = -EINVAL; - if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) - goto err2; + return err; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, 0, GFP_KERNEL); if (elem.priv == NULL) - goto err2; + goto fail_elem; ext = nft_set_elem_ext(set, elem.priv); if (flags) *nft_set_ext_flags(ext) = flags; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); - if (trans == NULL) { - err = -ENOMEM; - goto err3; - } + if (trans == NULL) + goto fail_trans; priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; - goto err4; + goto fail_ops; } kfree(elem.priv); elem.priv = priv; @@ -5140,13 +5140,12 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err4: +fail_ops: kfree(trans); -err3: +fail_trans: kfree(elem.priv); -err2: - nft_data_release(&elem.key.val, desc.type); -err1: +fail_elem: + nft_data_release(&elem.key.val, NFT_DATA_VALUE); return err; } -- cgit v1.2.3 From 7b225d0b5c6dda5fefab578175f210c6fc7e389a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Jan 2020 00:17:52 +0100 Subject: netfilter: nf_tables: add NFTA_SET_ELEM_KEY_END attribute Add NFTA_SET_ELEM_KEY_END attribute to convey the closing element of the interval between kernel and userspace. This patch also adds the NFT_SET_EXT_KEY_END extension to store the closing element value in this interval. v4: No changes v3: New patch [sbrivio: refactor error paths and labels; add corresponding nft_set_ext_type for new key; rebase] Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 +++++- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 85 +++++++++++++++++++++++--------- net/netfilter/nft_dynset.c | 2 +- 4 files changed, 79 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index fe7c50acc681..504c0aa93805 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -231,6 +231,7 @@ struct nft_userdata { * struct nft_set_elem - generic representation of set elements * * @key: element key + * @key_end: closing element key * @priv: element private data and extensions */ struct nft_set_elem { @@ -238,6 +239,10 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; + union { + u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; + struct nft_data val; + } key_end; void *priv; }; @@ -502,6 +507,7 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key + * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout @@ -513,6 +519,7 @@ void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); */ enum nft_set_extensions { NFT_SET_EXT_KEY, + NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, @@ -606,6 +613,11 @@ static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_KEY); } +static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_KEY_END); +} + static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); @@ -655,7 +667,7 @@ static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *data, + const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 261864736b26..c13106496bd2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -370,6 +370,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes) * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING) + * @NFTA_SET_ELEM_KEY_END: closing key value (NLA_NESTED: nft_data) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -382,6 +383,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_EXPR, NFTA_SET_ELEM_PAD, NFTA_SET_ELEM_OBJREF, + NFTA_SET_ELEM_KEY_END, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58e3b285a3d1..5f645a85538a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4215,6 +4215,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .len = sizeof(struct nft_userdata), .align = __alignof__(struct nft_userdata), }, + [NFT_SET_EXT_KEY_END] = { + .align = __alignof__(u32), + }, }; EXPORT_SYMBOL_GPL(nft_set_ext_types); @@ -4233,6 +4236,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, + [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -4282,6 +4286,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) && + nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext), + NFT_DATA_VALUE, set->klen) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, @@ -4569,6 +4578,13 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; + } + priv = set->ops->get(ctx->net, set, &elem, flags); if (IS_ERR(priv)) return PTR_ERR(priv); @@ -4694,8 +4710,8 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *data, - u64 timeout, u64 expiration, gfp_t gfp) + const u32 *key, const u32 *key_end, + const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4708,6 +4724,8 @@ void *nft_set_elem_init(const struct nft_set *set, nft_set_ext_init(ext, tmpl); memcpy(nft_set_ext_key(ext), key, set->klen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) + memcpy(nft_set_ext_key_end(ext), key_end, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { @@ -4842,9 +4860,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) - goto err1; + return err; nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + goto err_parse_key; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } + if (timeout > 0) { nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); if (timeout != set->timeout) @@ -4854,14 +4882,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { err = -EINVAL; - goto err2; + goto err_parse_key_end; } obj = nft_obj_lookup(ctx->net, ctx->table, nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err2; + goto err_parse_key_end; } nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); } @@ -4870,11 +4898,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = nft_data_init(ctx, &data, sizeof(data), &desc, nla[NFTA_SET_ELEM_DATA]); if (err < 0) - goto err2; + goto err_parse_key_end; err = -EINVAL; if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) - goto err3; + goto err_parse_data; dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { @@ -4892,7 +4920,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, &data, desc.type, desc.len); if (err < 0) - goto err3; + goto err_parse_data; if (desc.type == NFT_DATA_VERDICT && (data.verdict.code == NFT_GOTO || @@ -4917,10 +4945,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, data.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) - goto err3; + goto err_parse_data; ext = nft_set_elem_ext(set, elem.priv); if (flags) @@ -4937,7 +4966,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) - goto err4; + goto err_trans; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; err = set->ops->insert(ctx->net, set, &elem, &ext2); @@ -4948,7 +4977,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { err = -EBUSY; - goto err5; + goto err_element_clash; } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && @@ -4961,33 +4990,35 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; } - goto err5; + goto err_element_clash; } if (set->size && !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { err = -ENFILE; - goto err6; + goto err_set_full; } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err6: +err_set_full: set->ops->remove(ctx->net, set, &elem); -err5: +err_element_clash: kfree(trans); -err4: +err_trans: if (obj) obj->use--; kfree(elem.priv); -err3: +err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&data, desc.type); -err2: +err_parse_key_end: + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); +err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); -err1: + return err; } @@ -5112,9 +5143,19 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + if (nla[NFTA_SET_ELEM_KEY_END]) { + err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, + nla[NFTA_SET_ELEM_KEY_END]); + if (err < 0) + return err; + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen); + } + err = -ENOMEM; - elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - 0, GFP_KERNEL); + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, + elem.key_end.val.data, NULL, 0, 0, + GFP_KERNEL); if (elem.priv == NULL) goto fail_elem; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 8887295414dc..683785225a3e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -54,7 +54,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, timeout = priv->timeout ? : set->timeout; elem = nft_set_elem_init(set, &priv->tmpl, - ®s->data[priv->sreg_key], + ®s->data[priv->sreg_key], NULL, ®s->data[priv->sreg_data], timeout, 0, GFP_ATOMIC); if (elem == NULL) -- cgit v1.2.3 From f3a2181e16f1dcbf5446ed43f6b5d9f56c459f85 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 22 Jan 2020 00:17:53 +0100 Subject: netfilter: nf_tables: Support for sets with multiple ranged fields Introduce a new nested netlink attribute, NFTA_SET_DESC_CONCAT, used to specify the length of each field in a set concatenation. This allows set implementations to support concatenation of multiple ranged items, as they can divide the input key into matching data for every single field. Such set implementations would be selected as they specify support for NFT_SET_INTERVAL and allow desc->field_count to be greater than one. Explicitly disallow this for nft_set_rbtree. In order to specify the interval for a set entry, userspace would include in NFTA_SET_DESC_CONCAT attributes field lengths, and pass range endpoints as two separate keys, represented by attributes NFTA_SET_ELEM_KEY and NFTA_SET_ELEM_KEY_END. While at it, export the number of 32-bit registers available for packet matching, as nftables will need this to know the maximum number of field lengths that can be specified. For example, "packets with an IPv4 address between 192.0.2.0 and 192.0.2.42, with destination port between 22 and 25", can be expressed as two concatenated elements: NFTA_SET_ELEM_KEY: 192.0.2.0 . 22 NFTA_SET_ELEM_KEY_END: 192.0.2.42 . 25 and NFTA_SET_DESC_CONCAT attribute would contain: NFTA_LIST_ELEM NFTA_SET_FIELD_LEN: 4 NFTA_LIST_ELEM NFTA_SET_FIELD_LEN: 2 v4: No changes v3: Complete rework, NFTA_SET_DESC_CONCAT instead of NFTA_SET_SUBKEY v2: No changes Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 +++ include/uapi/linux/netfilter/nf_tables.h | 15 ++++++ net/netfilter/nf_tables_api.c | 90 +++++++++++++++++++++++++++++++- net/netfilter/nft_set_rbtree.c | 3 ++ 4 files changed, 115 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 504c0aa93805..4170c033d461 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -264,11 +264,15 @@ struct nft_set_iter { * @klen: key length * @dlen: data length * @size: number of set elements + * @field_len: length of each field in concatenation, bytes + * @field_count: number of concatenated fields in element */ struct nft_set_desc { unsigned int klen; unsigned int dlen; unsigned int size; + u8 field_len[NFT_REG32_COUNT]; + u8 field_count; }; /** @@ -409,6 +413,8 @@ void nft_unregister_set(struct nft_set_type *type); * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size + * @field_len: length of each field in concatenation, bytes + * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -435,6 +441,8 @@ struct nft_set { u32 dtype; u32 objtype; u32 size; + u8 field_len[NFT_REG32_COUNT]; + u8 field_count; u32 use; atomic_t nelems; u32 ndeact; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c13106496bd2..065218a20bb7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -48,6 +48,7 @@ enum nft_registers { #define NFT_REG_SIZE 16 #define NFT_REG32_SIZE 4 +#define NFT_REG32_COUNT (NFT_REG32_15 - NFT_REG32_00 + 1) /** * enum nft_verdicts - nf_tables internal verdicts @@ -301,14 +302,28 @@ enum nft_set_policies { * enum nft_set_desc_attributes - set element description * * @NFTA_SET_DESC_SIZE: number of elements in set (NLA_U32) + * @NFTA_SET_DESC_CONCAT: description of field concatenation (NLA_NESTED) */ enum nft_set_desc_attributes { NFTA_SET_DESC_UNSPEC, NFTA_SET_DESC_SIZE, + NFTA_SET_DESC_CONCAT, __NFTA_SET_DESC_MAX }; #define NFTA_SET_DESC_MAX (__NFTA_SET_DESC_MAX - 1) +/** + * enum nft_set_field_attributes - attributes of concatenated fields + * + * @NFTA_SET_FIELD_LEN: length of single field, in bits (NLA_U32) + */ +enum nft_set_field_attributes { + NFTA_SET_FIELD_UNSPEC, + NFTA_SET_FIELD_LEN, + __NFTA_SET_FIELD_MAX +}; +#define NFTA_SET_FIELD_MAX (__NFTA_SET_FIELD_MAX - 1) + /** * enum nft_set_attributes - nf_tables set netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5f645a85538a..d1318bdf49ca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3391,6 +3391,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, + [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, }; static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, @@ -3557,6 +3558,33 @@ static __be64 nf_jiffies64_to_msecs(u64 input) return cpu_to_be64(jiffies64_to_msecs(input)); } +static int nf_tables_fill_set_concat(struct sk_buff *skb, + const struct nft_set *set) +{ + struct nlattr *concat, *field; + int i; + + concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT); + if (!concat) + return -ENOMEM; + + for (i = 0; i < set->field_count; i++) { + field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); + if (!field) + return -ENOMEM; + + if (nla_put_be32(skb, NFTA_SET_FIELD_LEN, + htonl(set->field_len[i]))) + return -ENOMEM; + + nla_nest_end(skb, field); + } + + nla_nest_end(skb, concat); + + return 0; +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -3620,11 +3648,17 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; desc = nla_nest_start_noflag(skb, NFTA_SET_DESC); + if (desc == NULL) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) goto nla_put_failure; + + if (set->field_count > 1 && + nf_tables_fill_set_concat(skb, set)) + goto nla_put_failure; + nla_nest_end(skb, desc); nlmsg_end(skb, nlh); @@ -3797,6 +3831,53 @@ err: return err; } +static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { + [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, +}; + +static int nft_set_desc_concat_parse(const struct nlattr *attr, + struct nft_set_desc *desc) +{ + struct nlattr *tb[NFTA_SET_FIELD_MAX + 1]; + u32 len; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr, + nft_concat_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_SET_FIELD_LEN]) + return -EINVAL; + + len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN])); + + if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT) + return -E2BIG; + + desc->field_len[desc->field_count++] = len; + + return 0; +} + +static int nft_set_desc_concat(struct nft_set_desc *desc, + const struct nlattr *nla) +{ + struct nlattr *attr; + int rem, err; + + nla_for_each_nested(attr, nla, rem) { + if (nla_type(attr) != NFTA_LIST_ELEM) + return -EINVAL; + + err = nft_set_desc_concat_parse(attr, desc); + if (err < 0) + return err; + } + + return 0; +} + static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { @@ -3810,8 +3891,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, if (da[NFTA_SET_DESC_SIZE] != NULL) desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + if (da[NFTA_SET_DESC_CONCAT]) + err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]); - return 0; + return err; } static int nf_tables_newset(struct net *net, struct sock *nlsk, @@ -3834,6 +3917,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, unsigned char *udata; u16 udlen; int err; + int i; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || @@ -4012,6 +4096,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->gc_int = gc_int; set->handle = nf_tables_alloc_handle(table); + set->field_count = desc.field_count; + for (i = 0; i < desc.field_count; i++) + set->field_len[i] = desc.field_len[i]; + err = ops->init(set, &desc, nla); if (err < 0) goto err3; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index a9f804f7a04a..5000b938ab1e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -466,6 +466,9 @@ static void nft_rbtree_destroy(const struct nft_set *set) static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (desc->field_count > 1) + return false; + if (desc->size) est->size = sizeof(struct nft_rbtree) + desc->size * sizeof(struct nft_rbtree_elem); -- cgit v1.2.3 From 3c4287f62044a90e73a561aa05fc46e62da173da Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 22 Jan 2020 00:17:55 +0100 Subject: nf_tables: Add set type for arbitrary concatenation of ranges This new set type allows for intervals in concatenated fields, which are expressed in the usual way, that is, simple byte concatenation with padding to 32 bits for single fields, and given as ranges by specifying start and end elements containing, each, the full concatenation of start and end values for the single fields. Ranges are expanded to composing netmasks, for each field: these are inserted as rules in per-field lookup tables. Bits to be classified are divided in 4-bit groups, and for each group, the lookup table contains 4^2 buckets, representing all the possible values of a bit group. This approach was inspired by the Grouper algorithm: http://www.cse.usf.edu/~ligatti/projects/grouper/ Matching is performed by a sequence of AND operations between bucket values, with buckets selected according to the value of packet bits, for each group. The result of this sequence tells us which rules matched for a given field. In order to concatenate several ranged fields, per-field rules are mapped using mapping arrays, one per field, that specify which rules should be considered while matching the next field. The mapping array for the last field contains a reference to the element originally inserted. The notes in nft_set_pipapo.c cover the algorithm in deeper detail. A pure hash-based approach is of no use here, as ranges need to be classified. An implementation based on "proxying" the existing red-black tree set type, creating a tree for each field, was considered, but deemed impractical due to the fact that elements would need to be shared between trees, at least as long as we want to keep UAPI changes to a minimum. A stand-alone implementation of this algorithm is available at: https://pipapo.lameexcu.se together with notes about possible future optimisations (in pipapo.c). This algorithm was designed with data locality in mind, and can be highly optimised for SIMD instruction sets, as the bulk of the matching work is done with repetitive, simple bitwise operations. At this point, without further optimisations, nft_concat_range.sh reports, for one AMD Epyc 7351 thread (2.9GHz, 512 KiB L1D$, 8 MiB L2$): TEST: performance net,port [ OK ] baseline (drop from netdev hook): 10190076pps baseline hash (non-ranged entries): 6179564pps baseline rbtree (match on first field only): 2950341pps set with 1000 full, ranged entries: 2304165pps port,net [ OK ] baseline (drop from netdev hook): 10143615pps baseline hash (non-ranged entries): 6135776pps baseline rbtree (match on first field only): 4311934pps set with 100 full, ranged entries: 4131471pps net6,port [ OK ] baseline (drop from netdev hook): 9730404pps baseline hash (non-ranged entries): 4809557pps baseline rbtree (match on first field only): 1501699pps set with 1000 full, ranged entries: 1092557pps port,proto [ OK ] baseline (drop from netdev hook): 10812426pps baseline hash (non-ranged entries): 6929353pps baseline rbtree (match on first field only): 3027105pps set with 30000 full, ranged entries: 284147pps net6,port,mac [ OK ] baseline (drop from netdev hook): 9660114pps baseline hash (non-ranged entries): 3778877pps baseline rbtree (match on first field only): 3179379pps set with 10 full, ranged entries: 2082880pps net6,port,mac,proto [ OK ] baseline (drop from netdev hook): 9718324pps baseline hash (non-ranged entries): 3799021pps baseline rbtree (match on first field only): 1506689pps set with 1000 full, ranged entries: 783810pps net,mac [ OK ] baseline (drop from netdev hook): 10190029pps baseline hash (non-ranged entries): 5172218pps baseline rbtree (match on first field only): 2946863pps set with 1000 full, ranged entries: 1279122pps v4: - fix build for 32-bit architectures: 64-bit division needs div_u64() (kbuild test robot ) v3: - rework interface for field length specification, NFT_SET_SUBKEY disappears and information is stored in description - remove scratch area to store closing element of ranges, as elements now come with an actual attribute to specify the upper range limit (Pablo Neira Ayuso) - also remove pointer to 'start' element from mapping table, closing key is now accessible via extension data - use bytes right away instead of bits for field lengths, this way we can also double the inner loop of the lookup function to take care of upper and lower bits in a single iteration (minor performance improvement) - make it clearer that set operations are actually atomic API-wise, but we can't e.g. implement flush() as one-shot action - fix type for 'dup' in nft_pipapo_insert(), check for duplicates only in the next generation, and in general take care of differentiating generation mask cases depending on the operation (Pablo Neira Ayuso) - report C implementation matching rate in commit message, so that AVX2 implementation can be compared (Pablo Neira Ayuso) v2: - protect access to scratch maps in nft_pipapo_lookup() with local_bh_disable/enable() (Florian Westphal) - drop rcu_read_lock/unlock() from nft_pipapo_lookup(), it's already implied (Florian Westphal) - explain why partial allocation failures don't need handling in pipapo_realloc_scratch(), rename 'm' to clone and update related kerneldoc to make it clear we're not operating on the live copy (Florian Westphal) - add expicit check for priv->start_elem in nft_pipapo_insert() to avoid ending up in nft_pipapo_walk() with a NULL start element, and also zero it out in every operation that might make it invalid, so that insertion doesn't proceed with an invalid element (Florian Westphal) Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + net/netfilter/Makefile | 3 +- net/netfilter/nf_tables_set_core.c | 2 + net/netfilter/nft_set_pipapo.c | 2102 ++++++++++++++++++++++++++++++++ 4 files changed, 2107 insertions(+), 1 deletion(-) create mode 100644 net/netfilter/nft_set_pipapo.c (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 2656155b4069..29e7e1021267 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -74,6 +74,7 @@ extern struct nft_set_type nft_set_hash_type; extern struct nft_set_type nft_set_hash_fast_type; extern struct nft_set_type nft_set_rbtree_type; extern struct nft_set_type nft_set_bitmap_type; +extern struct nft_set_type nft_set_pipapo_type; struct nft_expr; struct nft_regs; diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5e9b2eb24349..3f572e5a975e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -81,7 +81,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nft_chain_route.o nf_tables_offload.o nf_tables_set-objs := nf_tables_set_core.o \ - nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ + nft_set_pipapo.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c index a9fce8d10051..586b621007eb 100644 --- a/net/netfilter/nf_tables_set_core.c +++ b/net/netfilter/nf_tables_set_core.c @@ -9,12 +9,14 @@ static int __init nf_tables_set_module_init(void) nft_register_set(&nft_set_rhash_type); nft_register_set(&nft_set_bitmap_type); nft_register_set(&nft_set_rbtree_type); + nft_register_set(&nft_set_pipapo_type); return 0; } static void __exit nf_tables_set_module_exit(void) { + nft_unregister_set(&nft_set_pipapo_type); nft_unregister_set(&nft_set_rbtree_type); nft_unregister_set(&nft_set_bitmap_type); nft_unregister_set(&nft_set_rhash_type); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c new file mode 100644 index 000000000000..f0cb1e13af50 --- /dev/null +++ b/net/netfilter/nft_set_pipapo.c @@ -0,0 +1,2102 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges + * + * Copyright (c) 2019-2020 Red Hat GmbH + * + * Author: Stefano Brivio + */ + +/** + * DOC: Theory of Operation + * + * + * Problem + * ------- + * + * Match packet bytes against entries composed of ranged or non-ranged packet + * field specifiers, mapping them to arbitrary references. For example: + * + * :: + * + * --- fields ---> + * | [net],[port],[net]... => [reference] + * entries [net],[port],[net]... => [reference] + * | [net],[port],[net]... => [reference] + * V ... + * + * where [net] fields can be IP ranges or netmasks, and [port] fields are port + * ranges. Arbitrary packet fields can be matched. + * + * + * Algorithm Overview + * ------------------ + * + * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally + * relies on the consideration that every contiguous range in a space of b bits + * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], + * as also illustrated in Section 9 of [Kogan 2014]. + * + * Classification against a number of entries, that require matching given bits + * of a packet field, is performed by grouping those bits in sets of arbitrary + * size, and classifying packet bits one group at a time. + * + * Example: + * to match the source port (16 bits) of a packet, we can divide those 16 bits + * in 4 groups of 4 bits each. Given the entry: + * 0000 0001 0101 1001 + * and a packet with source port: + * 0000 0001 1010 1001 + * first and second groups match, but the third doesn't. We conclude that the + * packet doesn't match the given entry. + * + * Translate the set to a sequence of lookup tables, one per field. Each table + * has two dimensions: bit groups to be matched for a single packet field, and + * all the possible values of said groups (buckets). Input entries are + * represented as one or more rules, depending on the number of composing + * netmasks for the given field specifier, and a group match is indicated as a + * set bit, with number corresponding to the rule index, in all the buckets + * whose value matches the entry for a given group. + * + * Rules are mapped between fields through an array of x, n pairs, with each + * item mapping a matched rule to one or more rules. The position of the pair in + * the array indicates the matched rule to be mapped to the next field, x + * indicates the first rule index in the next field, and n the amount of + * next-field rules the current rule maps to. + * + * The mapping array for the last field maps to the desired references. + * + * To match, we perform table lookups using the values of grouped packet bits, + * and use a sequence of bitwise operations to progressively evaluate rule + * matching. + * + * A stand-alone, reference implementation, also including notes about possible + * future optimisations, is available at: + * https://pipapo.lameexcu.se/ + * + * Insertion + * --------- + * + * - For each packet field: + * + * - divide the b packet bits we want to classify into groups of size t, + * obtaining ceil(b / t) groups + * + * Example: match on destination IP address, with t = 4: 32 bits, 8 groups + * of 4 bits each + * + * - allocate a lookup table with one column ("bucket") for each possible + * value of a group, and with one row for each group + * + * Example: 8 groups, 2^4 buckets: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 + * 1 + * 2 + * 3 + * 4 + * 5 + * 6 + * 7 + * + * - map the bits we want to classify for the current field, for a given + * entry, to a single rule for non-ranged and netmask set items, and to one + * or multiple rules for ranges. Ranges are expanded to composing netmasks + * by pipapo_expand(). + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 + * - rule #0: 10.0.0.5 + * - rule #1: 192.168.1.0/24 + * - rule #2: 192.168.2.0/31 + * + * - insert references to the rules in the lookup table, selecting buckets + * according to bit values of a rule in the given group. This is done by + * pipapo_insert(). + * + * Example: given: + * - rule #0: 10.0.0.5 mapping to buckets + * < 0 10 0 0 0 0 0 5 > + * - rule #1: 192.168.1.0/24 mapping to buckets + * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > + * - rule #2: 192.168.2.0/31 mapping to buckets + * < 12 0 10 8 0 2 0 < 0..1 > > + * + * these bits are set in the lookup table: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * - if this is not the last field in the set, fill a mapping array that maps + * rules from the lookup table to rules belonging to the same entry in + * the next lookup table, done by pipapo_map(). + * + * Note that as rules map to contiguous ranges of rules, given how netmask + * expansion and insertion is performed, &union nft_pipapo_map_bucket stores + * this information as pairs of first rule index, rule count. + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, + * given lookup table #0 for field 0 (see example above): + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * and lookup table #1 for field 1 with: + * - rule #0: 1024 mapping to buckets + * < 0 0 4 0 > + * - rule #1: 2048 mapping to buckets + * < 0 0 5 0 > + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 + * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 + * (rules #1, #2) to 2048 in lookup table #2 (rule #1): + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * - if this is the last field in the set, fill a mapping array that maps + * rules from the last lookup table to element pointers, also done by + * pipapo_map(). + * + * Note that, in this implementation, we have two elements (start, end) for + * each entry. The pointer to the end element is stored in this array, and + * the pointer to the start element is linked from it. + * + * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem + * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. + * From the rules of lookup table #1 as mapped above: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * + * Matching + * -------- + * + * We use a result bitmap, with the size of a single lookup table bucket, to + * represent the matching state that applies at every algorithm step. This is + * done by pipapo_lookup(). + * + * - For each packet field: + * + * - start with an all-ones result bitmap (res_map in pipapo_lookup()) + * + * - perform a lookup into the table corresponding to the current field, + * for each group, and at every group, AND the current result bitmap with + * the value from the lookup table bucket + * + * :: + * + * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from + * insertion examples. + * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for + * convenience in this example. Initial result bitmap is 0xff, the steps + * below show the value of the result bitmap after each group is processed: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 + * + * 1 1,2 0 + * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 + * + * 2 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 + * + * 3 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 + * + * 4 0,1,2 + * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 + * + * 5 0 1 2 + * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 + * + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 + * + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 + * + * - at the next field, start with a new, all-zeroes result bitmap. For each + * bit set in the previous result bitmap, fill the new result bitmap + * (fill_map in pipapo_lookup()) with the rule indices from the + * corresponding buckets of the mapping field for this field, done by + * pipapo_refill() + * + * Example: with mapping table from insertion examples, with the current + * result bitmap from the previous example, 0x02: + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be + * set. + * + * We can now extend this example to cover the second iteration of the step + * above (lookup and AND bitmap): assuming the port field is + * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table + * for "port" field from pre-computation example: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] + * & 0x3 [bucket 0], resulting bitmap is 0x2. + * + * - if this is the last field in the set, look up the value from the mapping + * array corresponding to the final result bitmap + * + * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for + * last field from insertion example: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * the matching element is at 0x42. + * + * + * References + * ---------- + * + * [Ligatti 2010] + * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with + * Automatic Time-space Tradeoffs + * Jay Ligatti, Josh Kuhn, and Chris Gage. + * Proceedings of the IEEE International Conference on Computer + * Communication Networks (ICCCN), August 2010. + * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * + * [Rottenstreich 2010] + * Worst-Case TCAM Rule Expansion + * Ori Rottenstreich and Isaac Keslassy. + * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf + * + * [Kogan 2014] + * SAX-PAC (Scalable And eXpressive PAcket Classification) + * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, + * and Patrick Eugster. + * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. + * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For the maximum length of a field */ +#include +#include + +/* Count of concatenated fields depends on count of 32-bit nftables registers */ +#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT + +/* Largest supported field size */ +#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) +#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) + +/* Number of bits to be grouped together in lookup table buckets, arbitrary */ +#define NFT_PIPAPO_GROUP_BITS 4 +#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) + +/* Fields are padded to 32 bits in input registers */ +#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ + (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) +#define NFT_PIPAPO_GROUPS_PADDING(x) \ + (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) + +/* Number of buckets, given by 2 ^ n, with n grouped bits */ +#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) + +/* Each n-bit range maps to up to n * 2 rules */ +#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) + +/* Use the rest of mapping table buckets for rule indices, but it makes no sense + * to exceed 32 bits + */ +#if BITS_PER_LONG == 64 +#define NFT_PIPAPO_MAP_TOBITS 32 +#else +#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) +#endif + +/* ...which gives us the highest allowed index for a rule */ +#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ + - (1UL << NFT_PIPAPO_MAP_NBITS)) + +#define nft_pipapo_for_each_field(field, index, match) \ + for ((field) = (match)->f, (index) = 0; \ + (index) < (match)->field_count; \ + (index)++, (field)++) + +/** + * union nft_pipapo_map_bucket - Bucket of mapping table + * @to: First rule number (in next field) this rule maps to + * @n: Number of rules (in next field) this rule maps to + * @e: If there's no next field, pointer to element this rule maps to + */ +union nft_pipapo_map_bucket { + struct { +#if BITS_PER_LONG == 64 + static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); + u32 to; + + static_assert(NFT_PIPAPO_MAP_NBITS <= 32); + u32 n; +#else + unsigned long to:NFT_PIPAPO_MAP_TOBITS; + unsigned long n:NFT_PIPAPO_MAP_NBITS; +#endif + }; + struct nft_pipapo_elem *e; +}; + +/** + * struct nft_pipapo_field - Lookup, mapping tables and related data for a field + * @groups: Amount of 4-bit groups + * @rules: Number of inserted rules + * @bsize: Size of each bucket in lookup table, in longs + * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets + * @mt: Mapping table: one bucket per rule + */ +struct nft_pipapo_field { + int groups; + unsigned long rules; + size_t bsize; + unsigned long *lt; + union nft_pipapo_map_bucket *mt; +}; + +/** + * struct nft_pipapo_match - Data used for lookup and matching + * @field_count Amount of fields in set + * @scratch: Preallocated per-CPU maps for partial matching results + * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @rcu Matching data is swapped on commits + * @f: Fields, with lookup and mapping tables + */ +struct nft_pipapo_match { + int field_count; + unsigned long * __percpu *scratch; + size_t bsize_max; + struct rcu_head rcu; + struct nft_pipapo_field f[0]; +}; + +/* Current working bitmap index, toggled between field matches */ +static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); + +/** + * struct nft_pipapo - Representation of a set + * @match: Currently in-use matching data + * @clone: Copy where pending insertions and deletions are kept + * @groups: Total amount of 4-bit groups for fields in this set + * @width: Total bytes to be matched for one packet, including padding + * @dirty: Working copy has pending insertions or deletions + * @last_gc: Timestamp of last garbage collection run, jiffies + */ +struct nft_pipapo { + struct nft_pipapo_match __rcu *match; + struct nft_pipapo_match *clone; + int groups; + int width; + bool dirty; + unsigned long last_gc; +}; + +struct nft_pipapo_elem; + +/** + * struct nft_pipapo_elem - API-facing representation of single set element + * @ext: nftables API extensions + */ +struct nft_pipapo_elem { + struct nft_set_ext ext; +}; + +/** + * pipapo_refill() - For each set bit, set bits from selected mapping table item + * @map: Bitmap to be scanned for set bits + * @len: Length of bitmap in longs + * @rules: Number of rules in field + * @dst: Destination bitmap + * @mt: Mapping table containing bit set specifiers + * @match_only: Find a single bit and return, don't fill + * + * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. + * + * For each bit set in map, select the bucket from mapping table with index + * corresponding to the position of the bit set. Use start bit and amount of + * bits specified in bucket to fill region in dst. + * + * Return: -1 on no match, bit position on 'match_only', 0 otherwise. + */ +static int pipapo_refill(unsigned long *map, int len, int rules, + unsigned long *dst, union nft_pipapo_map_bucket *mt, + bool match_only) +{ + unsigned long bitset; + int k, ret = -1; + + for (k = 0; k < len; k++) { + bitset = map[k]; + while (bitset) { + unsigned long t = bitset & -bitset; + int r = __builtin_ctzl(bitset); + int i = k * BITS_PER_LONG + r; + + if (unlikely(i >= rules)) { + map[k] = 0; + return -1; + } + + if (unlikely(match_only)) { + bitmap_clear(map, i, 1); + return i; + } + + ret = 0; + + bitmap_set(dst, mt[i].to, mt[i].n); + + bitset ^= t; + } + map[k] = 0; + } + + return ret; +} + +/** + * nft_pipapo_lookup() - Lookup function + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext: nftables API extension pointer, filled with matching reference + * + * For more details, see DOC: Theory of Operation. + * + * Return: true on match, false otherwise. + */ +static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_pipapo *priv = nft_set_priv(set); + unsigned long *res_map, *fill_map; + u8 genmask = nft_genmask_cur(net); + const u8 *rp = (const u8 *)key; + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + bool map_index; + int i; + + local_bh_disable(); + + map_index = raw_cpu_read(nft_pipapo_scratch_index); + + m = rcu_dereference(priv->match); + + if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + goto out; + + res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); + fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group += 2) { + u8 v; + + v = *rp >> 4; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + + v = *rp & 0x0f; + rp++; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) { + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return false; + } + + if (last) { + *ext = &f->mt[b].e->ext; + if (unlikely(nft_set_elem_expired(*ext) || + !nft_set_elem_active(*ext, genmask))) + goto next_match; + + /* Last field: we're just returning the key without + * filling the initial bitmap for the next field, so the + * current inactive bitmap is clean and can be reused as + * *next* bitmap (not initial) for the next packet. + */ + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return true; + } + + /* Swap bitmap indices: res_map is the initial bitmap for the + * next field, and fill_map is guaranteed to be all-zeroes at + * this point. + */ + map_index = !map_index; + swap(res_map, fill_map); + + rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); + } + +out: + local_bh_enable(); + return false; +} + +/** + * pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * + * This is essentially the same as the lookup function, except that it matches + * key data against the uncommitted copy and doesn't use preallocated maps for + * bitmap results. + * + * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + */ +static struct nft_pipapo_elem *pipapo_get(const struct net *net, + const struct nft_set *set, + const u8 *data, u8 genmask) +{ + struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + unsigned long *res_map, *fill_map = NULL; + struct nft_pipapo_field *f; + int i; + + res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!res_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!fill_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group++) { + u8 v; + + if (group % 2) { + v = *data & 0x0f; + data++; + } else { + v = *data >> 4; + } + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) + goto out; + + if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext) || + (genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; + + ret = f->mt[b].e; + goto out; + } + + data += NFT_PIPAPO_GROUPS_PADDING(f->groups); + + /* Swap bitmap indices: fill_map will be the initial bitmap for + * the next field (i.e. the new res_map), and res_map is + * guaranteed to be all-zeroes at this point, ready to be filled + * according to the next mapping table. + */ + swap(res_map, fill_map); + } + +out: + kfree(fill_map); + kfree(res_map); + return ret; +} + +/** + * nft_pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @flags: Unused + */ +void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); +} + +/** + * pipapo_resize() - Resize lookup or mapping table, or both + * @f: Field containing lookup and mapping tables + * @old_rules: Previous amount of rules in field + * @rules: New amount of rules + * + * Increase, decrease or maintain tables size depending on new amount of rules, + * and copy data over. In case the new size is smaller, throw away data for + * highest-numbered rules. + * + * Return: 0 on success, -ENOMEM on allocation failure. + */ +static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +{ + long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; + union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; + size_t new_bucket_size, copy; + int group, bucket; + + new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); + + if (new_bucket_size == f->bsize) + goto mt; + + if (new_bucket_size > f->bsize) + copy = f->bsize; + else + copy = new_bucket_size; + + new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * + sizeof(*new_lt), GFP_KERNEL); + if (!new_lt) + return -ENOMEM; + + new_p = new_lt; + old_p = old_lt; + for (group = 0; group < f->groups; group++) { + for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { + memcpy(new_p, old_p, copy * sizeof(*new_p)); + new_p += copy; + old_p += copy; + + if (new_bucket_size > f->bsize) + new_p += new_bucket_size - f->bsize; + else + old_p += f->bsize - new_bucket_size; + } + } + +mt: + new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); + if (!new_mt) { + kvfree(new_lt); + return -ENOMEM; + } + + memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } + + if (new_lt) { + f->bsize = new_bucket_size; + f->lt = new_lt; + kvfree(old_lt); + } + + f->mt = new_mt; + kvfree(old_mt); + + return 0; +} + +/** + * pipapo_bucket_set() - Set rule bit in bucket given group and group value + * @f: Field containing lookup table + * @rule: Rule index + * @group: Group index + * @v: Value of bit group + */ +static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, + int v) +{ + unsigned long *pos; + + pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; + pos += f->bsize * v; + + __set_bit(rule, pos); +} + +/** + * pipapo_insert() - Insert new rule in field given input key and mask length + * @f: Field containing lookup table + * @k: Input key for classification, without nftables padding + * @mask_bits: Length of mask; matches field length for non-ranged entry + * + * Insert a new rule reference in lookup buckets corresponding to k and + * mask_bits. + * + * Return: 1 on success (one rule inserted), negative error code on failure. + */ +static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, + int mask_bits) +{ + int rule = f->rules++, group, ret; + + ret = pipapo_resize(f, f->rules - 1, f->rules); + if (ret) + return ret; + + for (group = 0; group < f->groups; group++) { + int i, v; + u8 mask; + + if (group % 2) + v = k[group / 2] & 0x0f; + else + v = k[group / 2] >> 4; + + if (mask_bits >= (group + 1) * 4) { + /* Not masked */ + pipapo_bucket_set(f, rule, group, v); + } else if (mask_bits <= group * 4) { + /* Completely masked */ + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) + pipapo_bucket_set(f, rule, group, i); + } else { + /* The mask limit falls on this group */ + mask = 0x0f >> (mask_bits - group * 4); + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) { + if ((i & ~mask) == (v & ~mask)) + pipapo_bucket_set(f, rule, group, i); + } + } + } + + return 1; +} + +/** + * pipapo_step_diff() - Check if setting @step bit in netmask would change it + * @base: Mask we are expanding + * @step: Step bit for given expansion step + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if step bit changes mask (i.e. isn't set), false otherwise. + */ +static bool pipapo_step_diff(u8 *base, int step, int len) +{ + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); +#else + return !(BIT(step % BITS_PER_BYTE) & + base[len - 1 - step / BITS_PER_BYTE]); +#endif +} + +/** + * pipapo_step_after_end() - Check if mask exceeds range end with given step + * @base: Mask we are expanding + * @end: End of range + * @step: Step bit for given expansion step, highest bit to be set + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if mask exceeds range setting step bits, false otherwise. + */ +static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, + int len) +{ + u8 tmp[NFT_PIPAPO_MAX_BYTES]; + int i; + + memcpy(tmp, base, len); + + /* Network order, byte-addressed */ + for (i = 0; i <= step; i++) +#ifdef __BIG_ENDIAN__ + tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#else + tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#endif + + return memcmp(tmp, end, len) > 0; +} + +/** + * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry + * @base: Netmask base + * @step: Step bit to sum + * @len: Netmask length, bytes + */ +static void pipapo_base_sum(u8 *base, int step, int len) +{ + bool carry = false; + int i; + + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + for (i = step / BITS_PER_BYTE; i < len; i++) { +#else + for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { +#endif + if (carry) + base[i]++; + else + base[i] += 1 << (step % BITS_PER_BYTE); + + if (base[i]) + break; + + carry = true; + } +} + +/** + * pipapo_expand() - Expand to composing netmasks, insert into lookup table + * @f: Field containing lookup table + * @start: Start of range + * @end: End of range + * @len: Length of value in bits + * + * Expand range to composing netmasks and insert corresponding rule references + * in lookup buckets. + * + * Return: number of inserted rules on success, negative error code on failure. + */ +static int pipapo_expand(struct nft_pipapo_field *f, + const u8 *start, const u8 *end, int len) +{ + int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); + u8 base[NFT_PIPAPO_MAX_BYTES]; + + memcpy(base, start, bytes); + while (memcmp(base, end, bytes) <= 0) { + int err; + + step = 0; + while (pipapo_step_diff(base, step, bytes)) { + if (pipapo_step_after_end(base, end, step, bytes)) + break; + + step++; + if (step >= len) { + if (!masks) { + pipapo_insert(f, base, 0); + masks = 1; + } + goto out; + } + } + + err = pipapo_insert(f, base, len - step); + + if (err < 0) + return err; + + masks++; + pipapo_base_sum(base, step, bytes); + } +out: + return masks; +} + +/** + * pipapo_map() - Insert rules in mapping tables, mapping them between fields + * @m: Matching data, including mapping table + * @map: Table of rule maps: array of first rule and amount of rules + * in next field a given rule maps to, for each field + * @ext: For last field, nft_set_ext pointer matching rules map to + */ +static void pipapo_map(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], + struct nft_pipapo_elem *e) +{ + struct nft_pipapo_field *f; + int i, j; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { + for (j = 0; j < map[i].n; j++) { + f->mt[map[i].to + j].to = map[i + 1].to; + f->mt[map[i].to + j].n = map[i + 1].n; + } + } + + /* Last field: map to ext instead of mapping to next field */ + for (j = 0; j < map[i].n; j++) + f->mt[map[i].to + j].e = e; +} + +/** + * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results + * @clone: Copy of matching data with pending insertions and deletions + * @bsize_max Maximum bucket size, scratch maps cover two buckets + * + * Return: 0 on success, -ENOMEM on failure. + */ +static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, + unsigned long bsize_max) +{ + int i; + + for_each_possible_cpu(i) { + unsigned long *scratch; + + scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2, + GFP_KERNEL, cpu_to_node(i)); + if (!scratch) { + /* On failure, there's no need to undo previous + * allocations: this means that some scratch maps have + * a bigger allocated size now (this is only called on + * insertion), but the extra space won't be used by any + * CPU as new elements are not inserted and m->bsize_max + * is not updated. + */ + return -ENOMEM; + } + + kfree(*per_cpu_ptr(clone->scratch, i)); + + *per_cpu_ptr(clone->scratch, i) = scratch; + } + + return 0; +} + +/** + * nft_pipapo_insert() - Validate and insert ranged elements + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext2: Filled with pointer to &struct nft_set_ext in inserted element + * + * Return: 0 on success, error pointer on failure. + */ +static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext2) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *start = (const u8 *)elem->key.val.data, *end; + struct nft_pipapo_elem *e = elem->priv, *dup; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + u8 genmask = nft_genmask_next(net); + struct nft_pipapo_field *f; + int i, bsize_max, err = 0; + + dup = pipapo_get(net, set, start, genmask); + if (PTR_ERR(dup) == -ENOENT) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { + end = (const u8 *)nft_set_ext_key_end(ext)->data; + dup = pipapo_get(net, set, end, nft_genmask_next(net)); + } else { + end = start; + } + } + + if (PTR_ERR(dup) != -ENOENT) { + if (IS_ERR(dup)) + return PTR_ERR(dup); + *ext2 = &dup->ext; + return -EEXIST; + } + + /* Validate */ + nft_pipapo_for_each_field(f, i, m) { + const u8 *start_p = start, *end_p = end; + + if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) + return -ENOSPC; + + if (memcmp(start_p, end_p, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0) + return -EINVAL; + + start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + /* Insert */ + priv->dirty = true; + + bsize_max = m->bsize_max; + + nft_pipapo_for_each_field(f, i, m) { + int ret; + + rulemap[i].to = f->rules; + + ret = memcmp(start, end, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); + if (!ret) { + ret = pipapo_insert(f, start, + f->groups * NFT_PIPAPO_GROUP_BITS); + } else { + ret = pipapo_expand(f, start, end, + f->groups * NFT_PIPAPO_GROUP_BITS); + } + + if (f->bsize > bsize_max) + bsize_max = f->bsize; + + rulemap[i].n = ret; + + start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + err = pipapo_realloc_scratch(m, bsize_max); + if (err) + return err; + + this_cpu_write(nft_pipapo_scratch_index, false); + + m->bsize_max = bsize_max; + } + + *ext2 = &e->ext; + + pipapo_map(m, rulemap, e); + + return 0; +} + +/** + * pipapo_clone() - Clone matching data to create new working copy + * @old: Existing matching data + * + * Return: copy of matching data passed as 'old', error pointer on failure + */ +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) +{ + struct nft_pipapo_field *dst, *src; + struct nft_pipapo_match *new; + int i; + + new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, + GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + new->field_count = old->field_count; + new->bsize_max = old->bsize_max; + + new->scratch = alloc_percpu(*new->scratch); + if (!new->scratch) + goto out_scratch; + + rcu_head_init(&new->rcu); + + src = old->f; + dst = new->f; + + for (i = 0; i < old->field_count; i++) { + memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); + + dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS * + src->bsize * sizeof(*dst->lt), + GFP_KERNEL); + if (!dst->lt) + goto out_lt; + + memcpy(dst->lt, src->lt, + src->bsize * sizeof(*dst->lt) * + src->groups * NFT_PIPAPO_BUCKETS); + + dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); + if (!dst->mt) + goto out_mt; + + memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); + src++; + dst++; + } + + return new; + +out_mt: + kvfree(dst->lt); +out_lt: + for (dst--; i > 0; i--) { + kvfree(dst->mt); + kvfree(dst->lt); + dst--; + } + free_percpu(new->scratch); +out_scratch: + kfree(new); + + return ERR_PTR(-ENOMEM); +} + +/** + * pipapo_rules_same_key() - Get number of rules originated from the same entry + * @f: Field containing mapping table + * @first: Index of first rule in set of rules mapping to same entry + * + * Using the fact that all rules in a field that originated from the same entry + * will map to the same set of rules in the next field, or to the same element + * reference, return the cardinality of the set of rules that originated from + * the same entry as the rule with index @first, @first rule included. + * + * In pictures: + * rules + * field #0 0 1 2 3 4 + * map to: 0 1 2-4 2-4 5-9 + * . . ....... . ... + * | | | | \ \ + * | | | | \ \ + * | | | | \ \ + * ' ' ' ' ' \ + * in field #1 0 1 2 3 4 5 ... + * + * if this is called for rule 2 on field #0, it will return 3, as also rules 2 + * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. + * + * For the last field in a set, we can rely on associated entries to map to the + * same element references. + * + * Return: Number of rules that originated from the same entry as @first. + */ +static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +{ + struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ + int r; + + for (r = first; r < f->rules; r++) { + if (r != first && e != f->mt[r].e) + return r - first; + + e = f->mt[r].e; + } + + if (r != first) + return r - first; + + return 0; +} + +/** + * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones + * @mt: Mapping array + * @rules: Original amount of rules in mapping table + * @start: First rule index to be removed + * @n: Amount of rules to be removed + * @to_offset: First rule index, in next field, this group of rules maps to + * @is_last: If this is the last field, delete reference from mapping array + * + * This is used to unmap rules from the mapping table for a single field, + * maintaining consistency and compactness for the existing ones. + * + * In pictures: let's assume that we want to delete rules 2 and 3 from the + * following mapping array: + * + * rules + * 0 1 2 3 4 + * map to: 4-10 4-10 11-15 11-15 16-18 + * + * the result will be: + * + * rules + * 0 1 2 + * map to: 4-10 4-10 11-13 + * + * for fields before the last one. In case this is the mapping table for the + * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: + * + * rules + * 0 1 2 3 4 + * element pointers: 0x42 0x42 0x33 0x33 0x44 + * + * the result will be: + * + * rules + * 0 1 2 + * element pointers: 0x42 0x42 0x44 + */ +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, + int start, int n, int to_offset, bool is_last) +{ + int i; + + memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); + memset(mt + rules - n, 0, n * sizeof(*mt)); + + if (is_last) + return; + + for (i = start; i < rules - n; i++) + mt[i].to -= to_offset; +} + +/** + * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map + * @m: Matching data + * @rulemap Table of rule maps, arrays of first rule and amount of rules + * in next field a given entry maps to, for each field + * + * For each rule in lookup table buckets mapping to this set of rules, drop + * all bits set in lookup table mapping. In pictures, assuming we want to drop + * rules 0 and 1 from this lookup table: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * rule 2 becomes rule 0, and the result will be: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 + * 1 0 + * 2 0 + * 3 0 + * 4 0 + * 5 0 + * 6 0 + * 7 0 0 + * + * once this is done, call unmap() to drop all the corresponding rule references + * from mapping tables. + */ +static void pipapo_drop(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket rulemap[]) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + int g; + + for (g = 0; g < f->groups; g++) { + unsigned long *pos; + int b; + + pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize; + + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + bitmap_cut(pos, pos, rulemap[i].to, + rulemap[i].n, + f->bsize * BITS_PER_LONG); + + pos += f->bsize; + } + } + + pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, + rulemap[i + 1].n, i == m->field_count - 1); + if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { + /* We can ignore this, a failure to shrink tables down + * doesn't make tables invalid. + */ + ; + } + f->rules -= rulemap[i].n; + } +} + +/** + * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * @set: nftables API set representation + * @m: Matching data + */ +static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +{ + struct nft_pipapo *priv = nft_set_priv(set); + int rules_f0, first_rule = 0; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + struct nft_pipapo_field *f; + struct nft_pipapo_elem *e; + int i, start, rules_fx; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + if (i < m->field_count - 1) { + rules_fx = f->mt[start].n; + start = f->mt[start].to; + } + } + + /* Pick the last field, and its last index */ + f--; + i--; + e = f->mt[rulemap[i].to].e; + if (nft_set_elem_expired(&e->ext) && + !nft_set_elem_mark_busy(&e->ext)) { + priv->dirty = true; + pipapo_drop(m, rulemap); + + rcu_barrier(); + nft_set_elem_destroy(set, e, true); + + /* And check again current first rule, which is now the + * first we haven't checked. + */ + } else { + first_rule += rules_f0; + } + } + + priv->last_gc = jiffies; +} + +/** + * pipapo_free_fields() - Free per-field tables contained in matching data + * @m: Matching data + */ +static void pipapo_free_fields(struct nft_pipapo_match *m) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + kvfree(f->lt); + kvfree(f->mt); + } +} + +/** + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + int i; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + + for_each_possible_cpu(i) + kfree(*per_cpu_ptr(m->scratch, i)); + + free_percpu(m->scratch); + + pipapo_free_fields(m); + + kfree(m); +} + +/** + * pipapo_commit() - Replace lookup data with current working copy + * @set: nftables API set representation + * + * While at it, check if we should perform garbage collection on the working + * copy before committing it for lookup, and don't replace the table if the + * working copy doesn't have pending changes. + * + * We also need to create a new working copy for subsequent insertions and + * deletions. + */ +static void pipapo_commit(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *old; + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); + + if (!priv->dirty) + return; + + new_clone = pipapo_clone(priv->clone); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + old = rcu_access_pointer(priv->match); + rcu_assign_pointer(priv->match, priv->clone); + if (old) + call_rcu(&old->rcu, pipapo_reclaim_match); + + priv->clone = new_clone; +} + +/** + * nft_pipapo_activate() - Mark element reference as active given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * On insertion, elements are added to a copy of the matching data currently + * in use for lookups, and not directly inserted into current lookup data, so + * we'll take care of that by calling pipapo_commit() here. Both + * nft_pipapo_insert() and nft_pipapo_activate() are called once for each + * element, hence we can't purpose either one as a real commit operation. + */ +static void nft_pipapo_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); + if (IS_ERR(e)) + return; + + nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); + + pipapo_commit(set); +} + +/** + * pipapo_deactivate() - Check that element is in set, mark as inactive + * @net: Network namespace + * @set: nftables API set representation + * @data: Input key data + * @ext: nftables API extension pointer, used to check for end element + * + * This is a convenience function that can be called from both + * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same + * operation. + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, + const u8 *data, const struct nft_set_ext *ext) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, nft_genmask_next(net)); + if (IS_ERR(e)) + return NULL; + + nft_set_elem_change_active(net, set, &e->ext); + + return e; +} + +/** + * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *nft_pipapo_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); +} + +/** + * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * This is functionally the same as nft_pipapo_deactivate(), with a slightly + * different interface, and it's also called once for each element in a set + * being flushed, so we can't implement, strictly speaking, a flush operation, + * which would otherwise be as simple as allocating an empty copy of the + * matching data. + * + * Note that we could in theory do that, mark the set as flushed, and ignore + * subsequent calls, but we would leak all the elements after the first one, + * because they wouldn't then be freed as result of API calls. + * + * Return: true if element was found and deactivated. + */ +static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, + void *elem) +{ + struct nft_pipapo_elem *e = elem; + + return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), + &e->ext); +} + +/** + * pipapo_get_boundaries() - Get byte interval for associated rules + * @f: Field including lookup table + * @first_rule: First rule (lowest index) + * @rule_count: Number of associated rules + * @left: Byte expression for left boundary (start of range) + * @right: Byte expression for right boundary (end of range) + * + * Given the first rule and amount of rules that originated from the same entry, + * build the original range associated with the entry, and calculate the length + * of the originating netmask. + * + * In pictures: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 1,2 + * 1 1,2 + * 2 1,2 + * 3 1,2 + * 4 1,2 + * 5 1 2 + * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * + * this is the lookup table corresponding to the IPv4 range + * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, + * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. + * + * This function fills @left and @right with the byte values of the leftmost + * and rightmost bucket indices for the lowest and highest rule indices, + * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in + * nibbles: + * left: < 12, 0, 10, 8, 0, 1, 0, 0 > + * right: < 12, 0, 10, 8, 0, 2, 2, 1 > + * corresponding to bytes: + * left: < 192, 168, 1, 0 > + * right: < 192, 168, 2, 1 > + * with mask length irrelevant here, unused on return, as the range is already + * defined by its start and end points. The mask length is relevant for a single + * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore + * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes + * < 192, 168, 1, 255 >, and the mask length, calculated from the distances + * between leftmost and rightmost bucket indices for each group, would be 24. + * + * Return: mask length, in bits. + */ +static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, + int rule_count, u8 *left, u8 *right) +{ + u8 *l = left, *r = right; + int g, mask_len = 0; + + for (g = 0; g < f->groups; g++) { + int b, x0, x1; + + x0 = -1; + x1 = -1; + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + unsigned long *pos; + + pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; + if (test_bit(first_rule, pos) && x0 == -1) + x0 = b; + if (test_bit(first_rule + rule_count - 1, pos)) + x1 = b; + } + + if (g % 2) { + *(l++) |= x0 & 0x0f; + *(r++) |= x1 & 0x0f; + } else { + *l |= x0 << 4; + *r |= x1 << 4; + } + + if (x1 - x0 == 0) + mask_len += 4; + else if (x1 - x0 == 1) + mask_len += 3; + else if (x1 - x0 == 3) + mask_len += 2; + else if (x1 - x0 == 7) + mask_len += 1; + } + + return mask_len; +} + +/** + * pipapo_match_field() - Match rules against byte ranges + * @f: Field including the lookup table + * @first_rule: First of associated rules originating from same entry + * @rule_count: Amount of associated rules + * @start: Start of range to be matched + * @end: End of range to be matched + * + * Return: true on match, false otherwise. + */ +static bool pipapo_match_field(struct nft_pipapo_field *f, + int first_rule, int rule_count, + const u8 *start, const u8 *end) +{ + u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; + u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; + + pipapo_get_boundaries(f, first_rule, rule_count, left, right); + + return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) && + !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); +} + +/** + * nft_pipapo_remove() - Remove element given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Similarly to nft_pipapo_activate(), this is used as commit operation by the + * API, but it's called once per element in the pending transaction, so we can't + * implement this as a single commit operation. Closest we can get is to remove + * the matched element here, if any, and commit the updated matching data. + */ +static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const u8 *data = (const u8 *)elem->key.val.data; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, 0); + if (IS_ERR(e)) + return; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *match_start, *match_end; + struct nft_pipapo_field *f; + int i, start, rules_fx; + + match_start = data; + match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + if (!pipapo_match_field(f, start, rules_fx, + match_start, match_end)) + break; + + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + rules_fx = f->mt[start].n; + start = f->mt[start].to; + + match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (i == m->field_count) { + priv->dirty = true; + pipapo_drop(m, rulemap); + pipapo_commit(set); + return; + } + + first_rule += rules_f0; + } +} + +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * As elements are referenced in the mapping array for the last field, directly + * scan that array: there's no need to follow rule mappings from the first + * field. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r; + + rcu_read_lock(); + m = rcu_dereference(priv->match); + + if (unlikely(!m)) + goto out; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + struct nft_set_elem elem; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + if (iter->count < iter->skip) + goto cont; + + e = f->mt[r].e; + if (nft_set_elem_expired(&e->ext)) + goto cont; + + elem.priv = e; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + goto out; + +cont: + iter->count++; + } + +out: + rcu_read_unlock(); +} + +/** + * nft_pipapo_privsize() - Return the size of private data for the set + * @nla: netlink attributes, ignored as size doesn't depend on them + * @desc: Set description, ignored as size doesn't depend on it + * + * Return: size of private data for this set implementation, in bytes + */ +static u64 nft_pipapo_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) +{ + return sizeof(struct nft_pipapo); +} + +/** + * nft_pipapo_estimate() - Estimate set size, space and lookup complexity + * @desc: Set description, element count and field description used here + * @features: Flags: NFT_SET_INTERVAL needs to be there + * @est: Storage for estimation data + * + * The size for this set type can vary dramatically, as it depends on the number + * of rules (composing netmasks) the entries expand to. We compute the worst + * case here. + * + * In general, for a non-ranged entry or a single composing netmask, we need + * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that + * is, each input bit needs four bits of matching data), plus a bucket in the + * mapping table for each field. + * + * Return: true only for compatible range concatenations + */ +static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned long entry_size; + int i; + + if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) + return false; + + for (i = 0, entry_size = 0; i < desc->field_count; i++) { + unsigned long rules; + + if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) + return false; + + /* Worst-case ranges for each concatenated field: each n-bit + * field can expand to up to n * 2 rules in each bucket, and + * each rule also needs a mapping bucket. + */ + rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; + entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; + entry_size += rules * sizeof(union nft_pipapo_map_bucket); + } + + /* Rules in lookup and mapping tables are needed for each entry */ + est->size = desc->size * entry_size; + if (est->size && div_u64(est->size, desc->size) != entry_size) + return false; + + est->size += sizeof(struct nft_pipapo) + + sizeof(struct nft_pipapo_match) * 2; + + est->size += sizeof(struct nft_pipapo_field) * desc->field_count; + + est->lookup = NFT_SET_CLASS_O_LOG_N; + + est->space = NFT_SET_CLASS_O_N; + + return true; +} + +/** + * nft_pipapo_init() - Initialise data for a set instance + * @set: nftables API set representation + * @desc: Set description + * @nla: netlink attributes + * + * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink + * attributes, initialise internal set parameters, current instance of matching + * data and a copy for subsequent insertions. + * + * Return: 0 on success, negative error code on failure. + */ +static int nft_pipapo_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int err, i; + + if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) + return -EINVAL; + + m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, + GFP_KERNEL); + if (!m) + return -ENOMEM; + + m->field_count = desc->field_count; + m->bsize_max = 0; + + m->scratch = alloc_percpu(unsigned long *); + if (!m->scratch) { + err = -ENOMEM; + goto out_free; + } + for_each_possible_cpu(i) + *per_cpu_ptr(m->scratch, i) = NULL; + + rcu_head_init(&m->rcu); + + nft_pipapo_for_each_field(f, i, m) { + f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; + priv->groups += f->groups; + + priv->width += round_up(desc->field_len[i], sizeof(u32)); + + f->bsize = 0; + f->rules = 0; + f->lt = NULL; + f->mt = NULL; + } + + /* Create an initial clone of matching data for next insertion */ + priv->clone = pipapo_clone(m); + if (IS_ERR(priv->clone)) { + err = PTR_ERR(priv->clone); + goto out_free; + } + + priv->dirty = false; + + rcu_assign_pointer(priv->match, m); + + return 0; + +out_free: + free_percpu(m->scratch); + kfree(m); + + return err; +} + +/** + * nft_pipapo_destroy() - Free private data for set and all committed elements + * @set: nftables API set representation + */ +static void nft_pipapo_destroy(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r, cpu; + + m = rcu_dereference_protected(priv->match, true); + if (m) { + rcu_barrier(); + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + e = f->mt[r].e; + + nft_set_elem_destroy(set, e, true); + } + + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(m->scratch, cpu)); + free_percpu(m->scratch); + + pipapo_free_fields(m); + kfree(m); + priv->match = NULL; + } + + if (priv->clone) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); + free_percpu(priv->clone->scratch); + + pipapo_free_fields(priv->clone); + kfree(priv->clone); + priv->clone = NULL; + } +} + +/** + * nft_pipapo_gc_init() - Initialise garbage collection + * @set: nftables API set representation + * + * Instead of actually setting up a periodic work for garbage collection, as + * this operation requires a swap of matching data with the working copy, we'll + * do that opportunistically with other commit operations if the interval is + * elapsed, so we just need to set the current jiffies timestamp here. + */ +static void nft_pipapo_gc_init(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + +struct nft_set_type nft_set_pipapo_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT, + .ops = { + .lookup = nft_pipapo_lookup, + .insert = nft_pipapo_insert, + .activate = nft_pipapo_activate, + .deactivate = nft_pipapo_deactivate, + .flush = nft_pipapo_flush, + .remove = nft_pipapo_remove, + .walk = nft_pipapo_walk, + .get = nft_pipapo_get, + .privsize = nft_pipapo_privsize, + .estimate = nft_pipapo_estimate, + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, +}; -- cgit v1.2.3 From 2e24cd755552350b94a7617617c6877b8cbcb701 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 23 Jan 2020 16:26:18 -0800 Subject: net_sched: fix ops->bind_class() implementations The current implementations of ops->bind_class() are merely searching for classid and updating class in the struct tcf_result, without invoking either of cl_ops->bind_tcf() or cl_ops->unbind_tcf(). This breaks the design of them as qdisc's like cbq use them to count filters too. This is why syzbot triggered the warning in cbq_destroy_class(). In order to fix this, we have to call cl_ops->bind_tcf() and cl_ops->unbind_tcf() like the filter binding path. This patch does so by refactoring out two helper functions __tcf_bind_filter() and __tcf_unbind_filter(), which are lockless and accept a Qdisc pointer, then teaching each implementation to call them correctly. Note, we merely pass the Qdisc pointer as an opaque pointer to each filter, they only need to pass it down to the helper functions without understanding it at all. Fixes: 07d79fc7d94e ("net_sched: add reverse binding for tc class") Reported-and-tested-by: syzbot+0a0596220218fcb603a8@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+63bdb6006961d8c917c6@syzkaller.appspotmail.com Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 33 +++++++++++++++++++-------------- include/net/sch_generic.h | 3 ++- net/sched/cls_basic.c | 11 ++++++++--- net/sched/cls_bpf.c | 11 ++++++++--- net/sched/cls_flower.c | 11 ++++++++--- net/sched/cls_fw.c | 11 ++++++++--- net/sched/cls_matchall.c | 11 ++++++++--- net/sched/cls_route.c | 11 ++++++++--- net/sched/cls_rsvp.h | 11 ++++++++--- net/sched/cls_tcindex.c | 11 ++++++++--- net/sched/cls_u32.c | 11 ++++++++--- net/sched/sch_api.c | 6 ++++-- 12 files changed, 97 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index ce036492986a..a972244ab193 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -141,31 +141,38 @@ __cls_set_class(unsigned long *clp, unsigned long cl) return xchg(clp, cl); } -static inline unsigned long -cls_set_class(struct Qdisc *q, unsigned long *clp, unsigned long cl) +static inline void +__tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { - unsigned long old_cl; + unsigned long cl; - sch_tree_lock(q); - old_cl = __cls_set_class(clp, cl); - sch_tree_unlock(q); - return old_cl; + cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); + cl = __cls_set_class(&r->class, cl); + if (cl) + q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; - unsigned long cl; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; - cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); - cl = cls_set_class(q, &r->class, cl); - if (cl) + sch_tree_lock(q); + __tcf_bind_filter(q, r, base); + sch_tree_unlock(q); +} + +static inline void +__tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) +{ + unsigned long cl; + + if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } @@ -173,12 +180,10 @@ static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; - unsigned long cl; if (!q) return; - if ((cl = __cls_set_class(&r->class, 0)) != 0) - q->ops->cl_ops->unbind_tcf(q, cl); + __tcf_unbind_filter(q, r); } struct tcf_exts { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index fceddf89592a..151208704ed2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -318,7 +318,8 @@ struct tcf_proto_ops { void *type_data); void (*hw_del)(struct tcf_proto *tp, void *type_data); - void (*bind_class)(void *, u32, unsigned long); + void (*bind_class)(void *, u32, unsigned long, + void *, unsigned long); void * (*tmplt_create)(struct net *net, struct tcf_chain *chain, struct nlattr **tca, diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 4aafbe3d435c..f256a7c69093 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -263,12 +263,17 @@ skip: } } -static void basic_bind_class(void *fh, u32 classid, unsigned long cl) +static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct basic_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 8229ed4a67be..6e3e63db0e01 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -631,12 +631,17 @@ nla_put_failure: return -1; } -static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl) +static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct cls_bpf_prog *prog = fh; - if (prog && prog->res.classid == classid) - prog->res.class = cl; + if (prog && prog->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &prog->res, base); + else + __tcf_unbind_filter(q, &prog->res); + } } static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b0f42e62dd76..f9c0d1e8d380 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2765,12 +2765,17 @@ nla_put_failure: return -EMSGSIZE; } -static void fl_bind_class(void *fh, u32 classid, unsigned long cl) +static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_fl_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static bool fl_delete_empty(struct tcf_proto *tp) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index c9496c920d6f..ec945294626a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -419,12 +419,17 @@ nla_put_failure: return -1; } -static void fw_bind_class(void *fh, u32 classid, unsigned long cl) +static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct fw_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_fw_ops __read_mostly = { diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 7fc2eb62aa98..039cc86974f4 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -393,12 +393,17 @@ nla_put_failure: return -1; } -static void mall_bind_class(void *fh, u32 classid, unsigned long cl) +static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct cls_mall_head *head = fh; - if (head && head->res.classid == classid) - head->res.class = cl; + if (head && head->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &head->res, base); + else + __tcf_unbind_filter(q, &head->res); + } } static struct tcf_proto_ops cls_mall_ops __read_mostly = { diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 2d9e0b4484ea..6f8786b06bde 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -641,12 +641,17 @@ nla_put_failure: return -1; } -static void route4_bind_class(void *fh, u32 classid, unsigned long cl) +static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct route4_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops cls_route4_ops __read_mostly = { diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 2f3c03b25d5d..c22624131949 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -738,12 +738,17 @@ nla_put_failure: return -1; } -static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl) +static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct rsvp_filter *f = fh; - if (f && f->res.classid == classid) - f->res.class = cl; + if (f && f->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &f->res, base); + else + __tcf_unbind_filter(q, &f->res); + } } static struct tcf_proto_ops RSVP_OPS __read_mostly = { diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e573e5a5c794..3d4a1280352f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -654,12 +654,17 @@ nla_put_failure: return -1; } -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl) +static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, + void *q, unsigned long base) { struct tcindex_filter_result *r = fh; - if (r && r->res.classid == classid) - r->res.class = cl; + if (r && r->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &r->res, base); + else + __tcf_unbind_filter(q, &r->res); + } } static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index a0e6fac613de..e15ff335953d 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1255,12 +1255,17 @@ static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, return 0; } -static void u32_bind_class(void *fh, u32 classid, unsigned long cl) +static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q, + unsigned long base) { struct tc_u_knode *n = fh; - if (n && n->res.classid == classid) - n->res.class = cl; + if (n && n->res.classid == classid) { + if (cl) + __tcf_bind_filter(q, &n->res, base); + else + __tcf_unbind_filter(q, &n->res); + } } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1047825d9f48..943ad3425380 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1891,8 +1891,9 @@ static int tclass_del_notify(struct net *net, struct tcf_bind_args { struct tcf_walker w; - u32 classid; + unsigned long base; unsigned long cl; + u32 classid; }; static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -1903,7 +1904,7 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct Qdisc *q = tcf_block_q(tp->chain->block); sch_tree_lock(q); - tp->ops->bind_class(n, a->classid, a->cl); + tp->ops->bind_class(n, a->classid, a->cl, q, a->base); sch_tree_unlock(q); } return 0; @@ -1936,6 +1937,7 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, arg.w.fn = tcf_node_bind; arg.classid = clid; + arg.base = cl; arg.cl = new_cl; tp->ops->walk(tp, &arg.w, true); } -- cgit v1.2.3 From 760d228e322e99cdf6d81b4b60a268b8f13cf67a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 23 Jan 2020 17:27:08 -0800 Subject: net_sched: walk through all child classes in tc_bind_tclass() In a complex TC class hierarchy like this: tc qdisc add dev eth0 root handle 1:0 cbq bandwidth 100Mbit \ avpkt 1000 cell 8 tc class add dev eth0 parent 1:0 classid 1:1 cbq bandwidth 100Mbit \ rate 6Mbit weight 0.6Mbit prio 8 allot 1514 cell 8 maxburst 20 \ avpkt 1000 bounded tc filter add dev eth0 parent 1:0 protocol ip prio 1 u32 match ip \ sport 80 0xffff flowid 1:3 tc filter add dev eth0 parent 1:0 protocol ip prio 1 u32 match ip \ sport 25 0xffff flowid 1:4 tc class add dev eth0 parent 1:1 classid 1:3 cbq bandwidth 100Mbit \ rate 5Mbit weight 0.5Mbit prio 5 allot 1514 cell 8 maxburst 20 \ avpkt 1000 tc class add dev eth0 parent 1:1 classid 1:4 cbq bandwidth 100Mbit \ rate 3Mbit weight 0.3Mbit prio 5 allot 1514 cell 8 maxburst 20 \ avpkt 1000 where filters are installed on qdisc 1:0, so we can't merely search from class 1:1 when creating class 1:3 and class 1:4. We have to walk through all the child classes of the direct parent qdisc. Otherwise we would miss filters those need reverse binding. Fixes: 07d79fc7d94e ("net_sched: add reverse binding for tc class") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_api.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 943ad3425380..50794125bf02 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1910,22 +1910,24 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return 0; } -static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, - unsigned long new_cl) +struct tc_bind_class_args { + struct qdisc_walker w; + unsigned long new_cl; + u32 portid; + u32 clid; +}; + +static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *w) { + struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tcf_block *block; struct tcf_chain *chain; - unsigned long cl; - cl = cops->find(q, portid); - if (!cl) - return; - if (!cops->tcf_block) - return; block = cops->tcf_block(q, cl, NULL); if (!block) - return; + return 0; for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { @@ -1936,12 +1938,29 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; - arg.classid = clid; + arg.classid = a->clid; arg.base = cl; - arg.cl = new_cl; + arg.cl = a->new_cl; tp->ops->walk(tp, &arg.w, true); } } + + return 0; +} + +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, + unsigned long new_cl) +{ + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct tc_bind_class_args args = {}; + + if (!cops->tcf_block) + return; + args.portid = portid; + args.clid = clid; + args.new_cl = new_cl; + args.w.fn = tc_bind_class_walker; + q->ops->cl_ops->walk(q, &args.w); } #else -- cgit v1.2.3 From f9e95555757915fc194288862d2978e370fe316b Mon Sep 17 00:00:00 2001 From: Stephen Worley Date: Fri, 24 Jan 2020 16:53:27 -0500 Subject: net: include struct nhmsg size in nh nlmsg size Include the size of struct nhmsg size when calculating how much of a payload to allocate in a new netlink nexthop notification message. Without this, we will fail to fill the skbuff at certain nexthop group sizes. You can reproduce the failure with the following iproute2 commands: ip link add dummy1 type dummy ip link add dummy2 type dummy ip link add dummy3 type dummy ip link add dummy4 type dummy ip link add dummy5 type dummy ip link add dummy6 type dummy ip link add dummy7 type dummy ip link add dummy8 type dummy ip link add dummy9 type dummy ip link add dummy10 type dummy ip link add dummy11 type dummy ip link add dummy12 type dummy ip link add dummy13 type dummy ip link add dummy14 type dummy ip link add dummy15 type dummy ip link add dummy16 type dummy ip link add dummy17 type dummy ip link add dummy18 type dummy ip link add dummy19 type dummy ip ro add 1.1.1.1/32 dev dummy1 ip ro add 1.1.1.2/32 dev dummy2 ip ro add 1.1.1.3/32 dev dummy3 ip ro add 1.1.1.4/32 dev dummy4 ip ro add 1.1.1.5/32 dev dummy5 ip ro add 1.1.1.6/32 dev dummy6 ip ro add 1.1.1.7/32 dev dummy7 ip ro add 1.1.1.8/32 dev dummy8 ip ro add 1.1.1.9/32 dev dummy9 ip ro add 1.1.1.10/32 dev dummy10 ip ro add 1.1.1.11/32 dev dummy11 ip ro add 1.1.1.12/32 dev dummy12 ip ro add 1.1.1.13/32 dev dummy13 ip ro add 1.1.1.14/32 dev dummy14 ip ro add 1.1.1.15/32 dev dummy15 ip ro add 1.1.1.16/32 dev dummy16 ip ro add 1.1.1.17/32 dev dummy17 ip ro add 1.1.1.18/32 dev dummy18 ip ro add 1.1.1.19/32 dev dummy19 ip next add id 1 via 1.1.1.1 dev dummy1 ip next add id 2 via 1.1.1.2 dev dummy2 ip next add id 3 via 1.1.1.3 dev dummy3 ip next add id 4 via 1.1.1.4 dev dummy4 ip next add id 5 via 1.1.1.5 dev dummy5 ip next add id 6 via 1.1.1.6 dev dummy6 ip next add id 7 via 1.1.1.7 dev dummy7 ip next add id 8 via 1.1.1.8 dev dummy8 ip next add id 9 via 1.1.1.9 dev dummy9 ip next add id 10 via 1.1.1.10 dev dummy10 ip next add id 11 via 1.1.1.11 dev dummy11 ip next add id 12 via 1.1.1.12 dev dummy12 ip next add id 13 via 1.1.1.13 dev dummy13 ip next add id 14 via 1.1.1.14 dev dummy14 ip next add id 15 via 1.1.1.15 dev dummy15 ip next add id 16 via 1.1.1.16 dev dummy16 ip next add id 17 via 1.1.1.17 dev dummy17 ip next add id 18 via 1.1.1.18 dev dummy18 ip next add id 19 via 1.1.1.19 dev dummy19 ip next add id 1111 group 1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19 ip next del id 1111 Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: Stephen Worley Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 511eaa94e2d1..d072c326dd64 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -321,7 +321,9 @@ static size_t nh_nlmsg_size_single(struct nexthop *nh) static size_t nh_nlmsg_size(struct nexthop *nh) { - size_t sz = nla_total_size(4); /* NHA_ID */ + size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); + + sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); -- cgit v1.2.3 From 55cd9f67f1e45de8517cdaab985fb8e56c0bc1d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Jan 2020 14:57:20 -0800 Subject: net_sched: ematch: reject invalid TCF_EM_SIMPLE It is possible for malicious userspace to set TCF_EM_SIMPLE bit even for matches that should not have this bit set. This can fool two places using tcf_em_is_simple() 1) tcf_em_tree_destroy() -> memory leak of em->data if ops->destroy() is NULL 2) tcf_em_tree_dump() wrongly report/leak 4 low-order bytes of a kernel pointer. BUG: memory leak unreferenced object 0xffff888121850a40 (size 32): comm "syz-executor927", pid 7193, jiffies 4294941655 (age 19.840s) hex dump (first 32 bytes): 00 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f67036ea>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline] [<00000000f67036ea>] slab_post_alloc_hook mm/slab.h:586 [inline] [<00000000f67036ea>] slab_alloc mm/slab.c:3320 [inline] [<00000000f67036ea>] __do_kmalloc mm/slab.c:3654 [inline] [<00000000f67036ea>] __kmalloc_track_caller+0x165/0x300 mm/slab.c:3671 [<00000000fab0cc8e>] kmemdup+0x27/0x60 mm/util.c:127 [<00000000d9992e0a>] kmemdup include/linux/string.h:453 [inline] [<00000000d9992e0a>] em_nbyte_change+0x5b/0x90 net/sched/em_nbyte.c:32 [<000000007e04f711>] tcf_em_validate net/sched/ematch.c:241 [inline] [<000000007e04f711>] tcf_em_tree_validate net/sched/ematch.c:359 [inline] [<000000007e04f711>] tcf_em_tree_validate+0x332/0x46f net/sched/ematch.c:300 [<000000007a769204>] basic_set_parms net/sched/cls_basic.c:157 [inline] [<000000007a769204>] basic_change+0x1d7/0x5f0 net/sched/cls_basic.c:219 [<00000000e57a5997>] tc_new_tfilter+0x566/0xf70 net/sched/cls_api.c:2104 [<0000000074b68559>] rtnetlink_rcv_msg+0x3b2/0x4b0 net/core/rtnetlink.c:5415 [<00000000b7fe53fb>] netlink_rcv_skb+0x61/0x170 net/netlink/af_netlink.c:2477 [<00000000e83a40d0>] rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5442 [<00000000d62ba933>] netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] [<00000000d62ba933>] netlink_unicast+0x223/0x310 net/netlink/af_netlink.c:1328 [<0000000088070f72>] netlink_sendmsg+0x2c0/0x570 net/netlink/af_netlink.c:1917 [<00000000f70b15ea>] sock_sendmsg_nosec net/socket.c:639 [inline] [<00000000f70b15ea>] sock_sendmsg+0x54/0x70 net/socket.c:659 [<00000000ef95a9be>] ____sys_sendmsg+0x2d0/0x300 net/socket.c:2330 [<00000000b650f1ab>] ___sys_sendmsg+0x8a/0xd0 net/socket.c:2384 [<0000000055bfa74a>] __sys_sendmsg+0x80/0xf0 net/socket.c:2417 [<000000002abac183>] __do_sys_sendmsg net/socket.c:2426 [inline] [<000000002abac183>] __se_sys_sendmsg net/socket.c:2424 [inline] [<000000002abac183>] __x64_sys_sendmsg+0x23/0x30 net/socket.c:2424 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot+03c4738ed29d5d366ddf@syzkaller.appspotmail.com Cc: Cong Wang Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/ematch.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index d0140a92694a..dd3b8c11a2e0 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -238,6 +238,9 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { + err = -EINVAL; + if (em_hdr->flags & TCF_EM_SIMPLE) + goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; -- cgit v1.2.3 From 122d74fac84204b9a98263636f6f9a3b2e665639 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Jan 2020 23:08:04 +0000 Subject: rxrpc: Fix use-after-free in rxrpc_receive_data() The subpacket scanning loop in rxrpc_receive_data() references the subpacket count in the private data part of the sk_buff in the loop termination condition. However, when the final subpacket is pasted into the ring buffer, the function is no longer has a ref on the sk_buff and should not be looking at sp->* any more. This point is actually marked in the code when skb is cleared (but sp is not - which is an error). Fix this by caching sp->nr_subpackets in a local variable and using that instead. Also clear 'sp' to catch accesses after that point. This can show up as an oops in rxrpc_get_skb() if sp->nr_subpackets gets trashed by the sk_buff getting freed and reused in the meantime. Fixes: e2de6c404898 ("rxrpc: Use info in skbuff instead of reparsing a jumbo packet") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/input.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 86bd133b4fa0..96d54e5bf7bc 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -413,7 +413,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); enum rxrpc_call_state state; - unsigned int j; + unsigned int j, nr_subpackets; rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; rxrpc_seq_t seq0 = sp->hdr.seq, hard_ack; bool immediate_ack = false, jumbo_bad = false; @@ -457,7 +457,8 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) call->ackr_prev_seq = seq0; hard_ack = READ_ONCE(call->rx_hard_ack); - if (sp->nr_subpackets > 1) { + nr_subpackets = sp->nr_subpackets; + if (nr_subpackets > 1) { if (call->nr_jumbo_bad > 3) { ack = RXRPC_ACK_NOSPACE; ack_serial = serial; @@ -465,11 +466,11 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } } - for (j = 0; j < sp->nr_subpackets; j++) { + for (j = 0; j < nr_subpackets; j++) { rxrpc_serial_t serial = sp->hdr.serial + j; rxrpc_seq_t seq = seq0 + j; unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK; - bool terminal = (j == sp->nr_subpackets - 1); + bool terminal = (j == nr_subpackets - 1); bool last = terminal && (sp->rx_flags & RXRPC_SKB_INCL_LAST); u8 flags, annotation = j; @@ -506,7 +507,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } if (call->rxtx_buffer[ix]) { - rxrpc_input_dup_data(call, seq, sp->nr_subpackets > 1, + rxrpc_input_dup_data(call, seq, nr_subpackets > 1, &jumbo_bad); if (ack != RXRPC_ACK_DUPLICATE) { ack = RXRPC_ACK_DUPLICATE; @@ -564,6 +565,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) * ring. */ skb = NULL; + sp = NULL; } if (last) { -- cgit v1.2.3 From 3b33583265ed3b0ae76eddbabf9d038b4076d1a9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sat, 25 Jan 2020 11:26:42 +0100 Subject: net: Add fraglist GRO/GSO feature flags This adds new Fraglist GRO/GSO feature flags. They will be used to configure fraglist GRO/GSO what will be implemented with some followup paches. Signed-off-by: Steffen Klassert Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 6 +++++- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 2 ++ net/ethtool/common.c | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 4b19c544c59a..b239507da2a0 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -53,8 +53,9 @@ enum { NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */ NETIF_F_GSO_UDP_L4_BIT, /* ... UDP payload GSO (not UFO) */ + NETIF_F_GSO_FRAGLIST_BIT, /* ... Fraglist GSO */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_UDP_L4_BIT, + NETIF_F_GSO_FRAGLIST_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -80,6 +81,7 @@ enum { NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ + NETIF_F_GRO_FRAGLIST_BIT, /* Fraglist GRO */ /* * Add your fresh new feature above and remember to update @@ -150,6 +152,8 @@ enum { #define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) #define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) #define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) +#define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST) +#define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST) /* Finds the next feature with the highest number of the range of start till 0. */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 78e9c6c1b131..fcc76b890f50 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4570,6 +4570,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 26beae7db264..23aaaf08e1e9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -592,6 +592,8 @@ enum { SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, + + SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 diff --git a/net/ethtool/common.c b/net/ethtool/common.c index e621b1694d2f..c7b8956c3827 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -59,6 +59,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", + [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", }; const char -- cgit v1.2.3 From 1a3c998f3a27ab6ecf56bdbb17e27e55fd6d47cd Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sat, 25 Jan 2020 11:26:43 +0100 Subject: net: Add a netdev software feature set that defaults to off. The previous patch added the NETIF_F_GRO_FRAGLIST feature. This is a software feature that should default to off. Current software features default to on, so add a new feature set that defaults to off. Signed-off-by: Steffen Klassert Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 3 +++ net/core/dev.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index b239507da2a0..34d050bb1ae6 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -230,6 +230,9 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) /* changeable features with no special hardware requirements */ #define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) +/* Changeable features with no special hardware requirements that defaults to off. */ +#define NETIF_F_SOFT_FEATURES_OFF NETIF_F_GRO_FRAGLIST + #define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | \ diff --git a/net/core/dev.c b/net/core/dev.c index c806b078097b..a3b154a4b4f9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9283,7 +9283,7 @@ int register_netdevice(struct net_device *dev) /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ - dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->netdev_ops->ndo_udp_tunnel_add) { -- cgit v1.2.3 From 3a1296a38d0cf62bffb9a03c585cbd5dbf15d596 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sat, 25 Jan 2020 11:26:44 +0100 Subject: net: Support GRO/GSO fraglist chaining. This patch adds the core functions to chain/unchain GSO skbs at the frag_list pointer. This also adds a new GSO type SKB_GSO_FRAGLIST and a is_flist flag to napi_gro_cb which indicates that this flow will be GROed by fraglist chaining. Signed-off-by: Steffen Klassert Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++- include/linux/skbuff.h | 2 ++ net/core/dev.c | 2 +- net/core/skbuff.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fcc76b890f50..20445f94eb1c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2326,7 +2326,8 @@ struct napi_gro_cb { /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; - /* 1 bit hole */ + /* GRO is done by frag_list pointer chaining. */ + u8 is_flist:1; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -2694,6 +2695,7 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 23aaaf08e1e9..3d13a4b717e9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3535,6 +3535,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); +struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, + unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); diff --git a/net/core/dev.c b/net/core/dev.c index a3b154a4b4f9..ce8900dbd9ea 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3249,7 +3249,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 48a7029529c9..864cb9e9622f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3639,6 +3639,97 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) return head_frag; } +struct sk_buff *skb_segment_list(struct sk_buff *skb, + netdev_features_t features, + unsigned int offset) +{ + struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; + unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int delta_truesize = 0; + unsigned int delta_len = 0; + struct sk_buff *tail = NULL; + struct sk_buff *nskb; + + skb_push(skb, -skb_network_offset(skb) + offset); + + skb_shinfo(skb)->frag_list = NULL; + + do { + nskb = list_skb; + list_skb = list_skb->next; + + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + tail = nskb; + + delta_len += nskb->len; + delta_truesize += nskb->truesize; + + skb_push(nskb, -skb_network_offset(nskb) + offset); + + __copy_skb_header(nskb, skb); + + skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + offset + tnl_hlen); + + if (skb_needs_linearize(nskb, features) && + __skb_linearize(nskb)) + goto err_linearize; + + } while (list_skb); + + skb->truesize = skb->truesize - delta_truesize; + skb->data_len = skb->data_len - delta_len; + skb->len = skb->len - delta_len; + + skb_gso_reset(skb); + + skb->prev = tail; + + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto err_linearize; + + skb_get(skb); + + return skb; + +err_linearize: + kfree_skb_list(skb->next); + skb->next = NULL; + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(skb_segment_list); + +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive_list); + /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment -- cgit v1.2.3 From 9fd1ff5d2ac7181844735806b0a703c942365291 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sat, 25 Jan 2020 11:26:45 +0100 Subject: udp: Support UDP fraglist GRO/GSO. This patch extends UDP GRO to support fraglist GRO/GSO by using the previously introduced infrastructure. If the feature is enabled, all UDP packets are going to fraglist GRO (local input and forward). After validating the csum, we mark ip_summed as CHECKSUM_UNNECESSARY for fraglist GRO packets to make sure that the csum is not touched. Signed-off-by: Steffen Klassert Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/udp.h | 2 +- net/ipv4/udp_offload.c | 104 ++++++++++++++++++++++++++++++++++++++----------- net/ipv6/udp_offload.c | 27 ++++++++++++- 3 files changed, 107 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/udp.h b/include/net/udp.h index bad74f780831..44e0e52b585c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -167,7 +167,7 @@ typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup); + struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b25e42100ceb..1a98583a79f4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -184,6 +184,20 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); +static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + unsigned int mss = skb_shinfo(skb)->gso_size; + + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); + + return skb; +} + struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features) { @@ -196,6 +210,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, __sum16 check; __be16 newlen; + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) + return __udp_gso_segment_list(gso_skb, features); + mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); @@ -354,6 +371,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct udphdr *uh2; struct sk_buff *p; unsigned int ulen; + int ret = 0; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -369,7 +387,6 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, } /* pull encapsulating udp header */ skb_gro_pull(skb, sizeof(struct udphdr)); - skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) @@ -383,14 +400,40 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, continue; } + if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) { + NAPI_GRO_CB(skb)->flush = 1; + return p; + } + /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ - if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) || - ulen != ntohs(uh2->len) || + if (ulen > ntohs(uh2->len)) { + pp = p; + } else { + if (NAPI_GRO_CB(skb)->is_flist) { + if (!pskb_may_pull(skb, skb_gro_offset(skb))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + if ((skb->ip_summed != p->ip_summed) || + (skb->csum_level != p->csum_level)) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + ret = skb_gro_receive_list(p, skb); + } else { + skb_gro_postpull_rcsum(skb, uh, + sizeof(struct udphdr)); + + ret = skb_gro_receive(p, skb); + } + } + + if (ret || ulen != ntohs(uh2->len) || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; @@ -401,36 +444,29 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return NULL; } -INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, - __be16 sport, __be16 dport)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) + struct udphdr *uh, struct sock *sk) { struct sk_buff *pp = NULL; struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; - struct sock *sk; - rcu_read_lock(); - sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, - udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (!sk) - goto out_unlock; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) + NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1; - if (udp_sk(sk)->gro_enabled) { + if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { pp = call_gro_receive(udp_gro_receive_segment, head, skb); - rcu_read_unlock(); return pp; } - if (NAPI_GRO_CB(skb)->encap_mark || + if (!sk || NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid) || !udp_sk(sk)->gro_receive) - goto out_unlock; + goto out; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; @@ -457,8 +493,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); -out_unlock: - rcu_read_unlock(); +out: skb_gro_flush_final(skb, pp, flush); return pp; } @@ -468,8 +503,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sk_buff *pp; + struct sock *sk; - if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key)) + if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -484,7 +521,11 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; - return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb); + rcu_read_lock(); + sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + pp = udp_gro_receive(head, skb, uh, sk); + rcu_read_unlock(); + return pp; flush: NAPI_GRO_CB(skb)->flush = 1; @@ -517,9 +558,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, rcu_read_lock(); sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (sk && udp_sk(sk)->gro_enabled) { - err = udp_gro_complete_segment(skb); - } else if (sk && udp_sk(sk)->gro_complete) { + if (sk && udp_sk(sk)->gro_complete) { skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; @@ -529,6 +568,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); + } else { + err = udp_gro_complete_segment(skb); } rcu_read_unlock(); @@ -544,6 +585,23 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + if (NAPI_GRO_CB(skb)->is_flist) { + uh->len = htons(skb->len - nhoff); + + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } + + return 0; + } + if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index f0d5fc27d0b5..584157a07759 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -115,8 +115,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sk_buff *pp; + struct sock *sk; - if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key)) + if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -132,7 +134,11 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; - return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb); + rcu_read_lock(); + sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + pp = udp_gro_receive(head, skb, uh, sk); + rcu_read_unlock(); + return pp; flush: NAPI_GRO_CB(skb)->flush = 1; @@ -144,6 +150,23 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + if (NAPI_GRO_CB(skb)->is_flist) { + uh->len = htons(skb->len - nhoff); + + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } + + return 0; + } + if (uh->check) uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, &ipv6h->daddr, 0); -- cgit v1.2.3 From cd94ef06392ffd49e0a0e1c28bc5cd44f37f1f6b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 25 Jan 2020 10:41:02 +0000 Subject: soreuseport: Cleanup duplicate initialization of more_reuse->max_socks. reuseport_grow() does not need to initialize the more_reuse->max_socks again. It is already initialized in __reuseport_alloc(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/sock_reuseport.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index f19f179538b9..91e9f2223c39 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -107,7 +107,6 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) if (!more_reuse) return NULL; - more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; -- cgit v1.2.3 From 48b3a1379fc6603c1ff26893ea05322c1c41e31c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 25 Jan 2020 12:17:06 +0100 Subject: net: call call_netdevice_unregister_net_notifiers from unregister The function does the same thing as the existing code, so rather call call_netdevice_unregister_net_notifiers() instead of code duplication. Signed-off-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/dev.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ce8900dbd9ea..ee4b1e64d663 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1764,7 +1764,6 @@ EXPORT_SYMBOL(register_netdevice_notifier); int unregister_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; struct net *net; int err; @@ -1775,16 +1774,9 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } + for_each_net(net) + call_netdevice_unregister_net_notifiers(nb, net); + unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); -- cgit v1.2.3 From 1f637703d8b63f1ba411b4c798e998e3f828b6cb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 25 Jan 2020 12:17:07 +0100 Subject: net: push code from net notifier reg/unreg into helpers Push the code which is done under rtnl lock in net notifier register and unregister function into separate helpers. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 60 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ee4b1e64d663..b521b509a653 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1784,6 +1784,42 @@ unlock: } EXPORT_SYMBOL(unregister_netdevice_notifier); +static int __register_netdevice_notifier_net(struct net *net, + struct notifier_block *nb, + bool ignore_call_fail) +{ + int err; + + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + return err; + if (dev_boot_phase) + return 0; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err && !ignore_call_fail) + goto chain_unregister; + + return 0; + +chain_unregister: + raw_notifier_chain_unregister(&net->netdev_chain, nb); + return err; +} + +static int __unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + return err; + + call_netdevice_unregister_net_notifiers(nb, net); + return 0; +} + /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace @@ -1804,23 +1840,9 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) int err; rtnl_lock(); - err = raw_notifier_chain_register(&net->netdev_chain, nb); - if (err) - goto unlock; - if (dev_boot_phase) - goto unlock; - - err = call_netdevice_register_net_notifiers(nb, net); - if (err) - goto chain_unregister; - -unlock: + err = __register_netdevice_notifier_net(net, nb, false); rtnl_unlock(); return err; - -chain_unregister: - raw_notifier_chain_unregister(&netdev_chain, nb); - goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier_net); @@ -1846,13 +1868,7 @@ int unregister_netdevice_notifier_net(struct net *net, int err; rtnl_lock(); - err = raw_notifier_chain_unregister(&net->netdev_chain, nb); - if (err) - goto unlock; - - call_netdevice_unregister_net_notifiers(nb, net); - -unlock: + err = __unregister_netdevice_notifier_net(net, nb); rtnl_unlock(); return err; } -- cgit v1.2.3 From 93642e14bd50e59b11cf6389ce3fc243e932777a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 25 Jan 2020 12:17:08 +0100 Subject: net: introduce dev_net notifier register/unregister variants Introduce dev_net variants of netdev notifier register/unregister functions and allow per-net notifier to follow the netdevice into the namespace it is moved to. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 17 +++++++++++++++++ net/core/dev.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20445f94eb1c..4626188a754b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -939,6 +939,11 @@ struct netdev_name_node { int netdev_name_node_alt_create(struct net_device *dev, const char *name); int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); +struct netdev_net_notifier { + struct list_head list; + struct notifier_block *nb; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1793,6 +1798,10 @@ enum netdev_priv_flags { * * @wol_enabled: Wake-on-LAN is enabled * + * @net_notifier_list: List of per-net netdev notifier block + * that follow this device when it is moved + * to another network namespace. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2085,6 +2094,8 @@ struct net_device { struct lock_class_key addr_list_lock_key; bool proto_down; unsigned wol_enabled:1; + + struct list_head net_notifier_list; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2529,6 +2540,12 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn); +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn); struct netdev_notifier_info { struct net_device *dev; diff --git a/net/core/dev.c b/net/core/dev.c index b521b509a653..38bc35da39f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1874,6 +1874,48 @@ int unregister_netdevice_notifier_net(struct net *net, } EXPORT_SYMBOL(unregister_netdevice_notifier_net); +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_lock(); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); + if (!err) { + nn->nb = nb; + list_add(&nn->list, &dev->net_notifier_list); + } + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_dev_net); + +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_lock(); + list_del(&nn->list); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); + +static void move_netdevice_notifiers_dev_net(struct net_device *dev, + struct net *net) +{ + struct netdev_net_notifier *nn; + + list_for_each_entry(nn, &dev->net_notifier_list, list) { + __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); + __register_netdevice_notifier_net(net, nn->nb, true); + } +} + /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -9786,6 +9828,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); + INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif @@ -10049,6 +10092,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); + /* Move per-net netdevice notifiers that are following the netdevice */ + move_netdevice_notifiers_dev_net(dev, net); + /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; -- cgit v1.2.3 From a85dd3a5170c8812cd835ea968ccadf0ebf1648e Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 25 Jan 2020 13:42:14 +0100 Subject: net: remove eth_change_mtu All usage of this function was removed three years ago, and the function was marked as deprecated: a52ad514fdf3 ("net: deprecate eth_change_mtu, remove usage") So I think we can remove it now. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 1 - net/ethernet/eth.c | 16 ---------------- 2 files changed, 17 deletions(-) (limited to 'net') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index f6564b572d77..8801f1f986e5 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -43,7 +43,6 @@ __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); -int eth_change_mtu(struct net_device *dev, int new_mtu); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9040fe55e0f5..c8b903302ff2 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -335,22 +335,6 @@ int eth_mac_addr(struct net_device *dev, void *p) } EXPORT_SYMBOL(eth_mac_addr); -/** - * eth_change_mtu - set new MTU size - * @dev: network device - * @new_mtu: new Maximum Transfer Unit - * - * Allow changing MTU size. Needs to be overridden for devices - * supporting jumbo frames. - */ -int eth_change_mtu(struct net_device *dev, int new_mtu) -{ - netdev_warn(dev, "%s is deprecated\n", __func__); - dev->mtu = new_mtu; - return 0; -} -EXPORT_SYMBOL(eth_change_mtu); - int eth_validate_addr(struct net_device *dev) { if (!is_valid_ether_addr(dev->dev_addr)) -- cgit v1.2.3 From 6dc43cd3aae0ffeeaebe427e1bf5e5faf9de7d42 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 25 Jan 2020 23:01:11 +0200 Subject: net: dsa: Fix use-after-free in probing of DSA switch tree DSA sets up a switch tree little by little. Every switch of the N members of the tree calls dsa_register_switch, and (N - 1) will just touch the dst->ports list with their ports and quickly exit. Only the last switch that calls dsa_register_switch will find all DSA links complete in dsa_tree_setup_routing_table, and not return zero as a result but instead go ahead and set up the entire DSA switch tree (practically on behalf of the other switches too). The trouble is that the (N - 1) switches don't clean up after themselves after they get an error such as EPROBE_DEFER. Their footprint left in dst->ports by dsa_switch_touch_ports is still there. And switch N, the one responsible with actually setting up the tree, is going to work with those stale dp, dp->ds and dp->ds->dev pointers. In particular ds and ds->dev might get freed by the device driver. Be there a 2-switch tree and the following calling order: - Switch 1 calls dsa_register_switch - Calls dsa_switch_touch_ports, populates dst->ports - Calls dsa_port_parse_cpu, gets -EPROBE_DEFER, exits. - Switch 2 calls dsa_register_switch - Calls dsa_switch_touch_ports, populates dst->ports - Probe doesn't get deferred, so it goes ahead. - Calls dsa_tree_setup_routing_table, which returns "complete == true" due to Switch 1 having called dsa_switch_touch_ports before. - Because the DSA links are complete, it calls dsa_tree_setup_switches now. - dsa_tree_setup_switches iterates through dst->ports, initializing the Switch 1 ds structure (invalid) and the Switch 2 ds structure (valid). - Undefined behavior (use after free, sometimes NULL pointers, etc). Real example below (debugging prints added by me, as well as guards against NULL pointers): [ 5.477947] dsa_tree_setup_switches: Setting up port 0 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.313002] dsa_tree_setup_switches: Setting up port 1 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.319932] dsa_tree_setup_switches: Setting up port 2 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.329693] dsa_tree_setup_switches: Setting up port 3 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.339458] dsa_tree_setup_switches: Setting up port 4 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.349226] dsa_tree_setup_switches: Setting up port 5 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.358991] dsa_tree_setup_switches: Setting up port 6 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.368758] dsa_tree_setup_switches: Setting up port 7 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.378524] dsa_tree_setup_switches: Setting up port 8 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.388291] dsa_tree_setup_switches: Setting up port 9 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.398057] dsa_tree_setup_switches: Setting up port 10 of switch ffffff803df0b980 (dev ffffff803f775c00) [ 6.407912] dsa_tree_setup_switches: Setting up port 0 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.417682] dsa_tree_setup_switches: Setting up port 1 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.427446] dsa_tree_setup_switches: Setting up port 2 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.437212] dsa_tree_setup_switches: Setting up port 3 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.446979] dsa_tree_setup_switches: Setting up port 4 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.456744] dsa_tree_setup_switches: Setting up port 5 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.466512] dsa_tree_setup_switches: Setting up port 6 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.476277] dsa_tree_setup_switches: Setting up port 7 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.486043] dsa_tree_setup_switches: Setting up port 8 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.495810] dsa_tree_setup_switches: Setting up port 9 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.505577] dsa_tree_setup_switches: Setting up port 10 of switch ffffff803da02f80 (dev 0000000000000000) [ 6.515433] dsa_tree_setup_switches: Setting up port 0 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.354120] dsa_tree_setup_switches: Setting up port 1 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.361045] dsa_tree_setup_switches: Setting up port 2 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.370805] dsa_tree_setup_switches: Setting up port 3 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.380571] dsa_tree_setup_switches: Setting up port 4 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.390337] dsa_tree_setup_switches: Setting up port 5 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.400104] dsa_tree_setup_switches: Setting up port 6 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.409872] dsa_tree_setup_switches: Setting up port 7 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.419637] dsa_tree_setup_switches: Setting up port 8 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.429403] dsa_tree_setup_switches: Setting up port 9 of switch ffffff803db15b80 (dev ffffff803d8e4800) [ 7.439169] dsa_tree_setup_switches: Setting up port 10 of switch ffffff803db15b80 (dev ffffff803d8e4800) The solution is to recognize that the functions that call dsa_switch_touch_ports (dsa_switch_parse_of, dsa_switch_parse) have side effects, and therefore one should clean up their side effects on error path. The cleanup of dst->ports was taken from dsa_switch_remove and moved into a dedicated dsa_switch_release_ports function, which should really be per-switch (free only the members of dst->ports that are also members of ds, instead of all switch ports). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c6d81f2baf4e..e7c30b472034 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -849,6 +849,19 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) return dsa_switch_parse_ports(ds, cd); } +static void dsa_switch_release_ports(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp, *next; + + list_for_each_entry_safe(dp, next, &dst->ports, list) { + if (dp->ds != ds) + continue; + list_del(&dp->list); + kfree(dp); + } +} + static int dsa_switch_probe(struct dsa_switch *ds) { struct dsa_switch_tree *dst; @@ -865,12 +878,17 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (!ds->num_ports) return -EINVAL; - if (np) + if (np) { err = dsa_switch_parse_of(ds, np); - else if (pdata) + if (err) + dsa_switch_release_ports(ds); + } else if (pdata) { err = dsa_switch_parse(ds, pdata); - else + if (err) + dsa_switch_release_ports(ds); + } else { err = -ENODEV; + } if (err) return err; @@ -878,8 +896,10 @@ static int dsa_switch_probe(struct dsa_switch *ds) dst = ds->dst; dsa_tree_get(dst); err = dsa_tree_setup(dst); - if (err) + if (err) { + dsa_switch_release_ports(ds); dsa_tree_put(dst); + } return err; } @@ -900,15 +920,9 @@ EXPORT_SYMBOL_GPL(dsa_register_switch); static void dsa_switch_remove(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; - struct dsa_port *dp, *next; dsa_tree_teardown(dst); - - list_for_each_entry_safe(dp, next, &dst->ports, list) { - list_del(&dp->list); - kfree(dp); - } - + dsa_switch_release_ports(ds); dsa_tree_put(dst); } -- cgit v1.2.3 From 59fb9b62fb6c929a756563152a89f39b07cf8893 Mon Sep 17 00:00:00 2001 From: Yoshiki Komachi Date: Fri, 17 Jan 2020 16:05:32 +0900 Subject: flow_dissector: Fix to use new variables for port ranges in bpf hook This patch applies new flag (FLOW_DISSECTOR_KEY_PORTS_RANGE) and field (tp_range) to BPF flow dissector to generate appropriate flow keys when classified by specified port ranges. Fixes: 8ffb055beae5 ("cls_flower: Fix the behavior using port ranges with hw-offload") Signed-off-by: Yoshiki Komachi Signed-off-by: Daniel Borkmann Acked-by: Petar Penkov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200117070533.402240-2-komachi.yoshiki@gmail.com --- net/core/flow_dissector.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index f560b4902060..a1670dff0629 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -834,10 +834,10 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; - struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, @@ -876,10 +876,17 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); + else if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } -- cgit v1.2.3 From d2c4b444fd13f4699ab8c4c49062cd2c7516c241 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 26 Jan 2020 23:11:01 +0100 Subject: ethtool: fix kernel-doc descriptions Fix missing or incorrect function argument and struct member descriptions. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 26 ++++++++++++++------------ net/ethtool/strset.c | 1 + 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 86b79f9bc08d..0af43bbdb9b2 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -134,11 +134,12 @@ nla_put_failure: /** * ethnl_reply_init() - Create skb for a reply and fill device identification - * @payload: payload length (without netlink and genetlink header) - * @dev: device the reply is about (may be null) - * @cmd: ETHTOOL_MSG_* message type for reply - * @info: genetlink info of the received packet we respond to - * @ehdrp: place to store payload pointer returned by genlmsg_new() + * @payload: payload length (without netlink and genetlink header) + * @dev: device the reply is about (may be null) + * @cmd: ETHTOOL_MSG_* message type for reply + * @hdr_attrtype: attribute type for common header + * @info: genetlink info of the received packet we respond to + * @ehdrp: place to store payload pointer returned by genlmsg_new() * * Return: pointer to allocated skb on success, NULL on error */ @@ -188,10 +189,11 @@ static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) /** * struct ethnl_dump_ctx - context structure for generic dumpit() callback - * @ops: request ops of currently processed message type - * @req_info: parsed request header of processed request - * @pos_hash: saved iteration position - hashbucket - * @pos_idx: saved iteration position - index + * @ops: request ops of currently processed message type + * @req_info: parsed request header of processed request + * @reply_data: data needed to compose the reply + * @pos_hash: saved iteration position - hashbucket + * @pos_idx: saved iteration position - index * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used @@ -268,9 +270,9 @@ out: /** * ethnl_init_reply_data() - Initialize reply data for GET request - * @req_info: pointer to embedded struct ethnl_req_info - * @ops: instance of struct ethnl_request_ops describing the layout - * @dev: network device to initialize the reply for + * @reply_data: pointer to embedded struct ethnl_reply_data + * @ops: instance of struct ethnl_request_ops describing the layout + * @dev: network device to initialize the reply for * * Fills the reply data part with zeros and sets the dev member. Must be called * before calling the ->fill_reply() callback (for each iteration when handling diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 82a059c13c1c..948d967f1eca 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -85,6 +85,7 @@ get_stringset_policy[ETHTOOL_A_STRINGSET_MAX + 1] = { /** * strset_include() - test if a string set should be included in reply + * @info: parsed client request * @data: pointer to request data structure * @id: id of string set to check (ETH_SS_* constants) */ -- cgit v1.2.3 From 6a94b8ccf6b77f005ab1b36a878e1d81df0c033e Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 26 Jan 2020 23:11:04 +0100 Subject: ethtool: provide message mask with DEBUG_GET request Implement DEBUG_GET request to get debugging settings for a device. At the moment, only message mask corresponding to message level as reported by ETHTOOL_GMSGLVL ioctl request is provided. (It is called message level in ioctl interface but almost all drivers interpret it as a bit mask.) As part of the implementation, provide symbolic names for message mask bits as ETH_SS_MSG_CLASSES string set. Signed-off-by: Michal Kubecek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 34 +++++++++++- include/linux/netdevice.h | 56 +++++++++++++------ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 14 +++++ net/ethtool/Makefile | 2 +- net/ethtool/common.c | 19 +++++++ net/ethtool/common.h | 1 + net/ethtool/debug.c | 80 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 8 +++ net/ethtool/netlink.h | 1 + net/ethtool/strset.c | 5 ++ 11 files changed, 205 insertions(+), 17 deletions(-) create mode 100644 net/ethtool/debug.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index c60afba69e3c..76a411d3102d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -185,6 +185,7 @@ Userspace to kernel: ``ETHTOOL_MSG_LINKMODES_GET`` get link modes info ``ETHTOOL_MSG_LINKMODES_SET`` set link modes info ``ETHTOOL_MSG_LINKSTATE_GET`` get link state + ``ETHTOOL_MSG_DEBUG_GET`` get debugging settings ===================================== ================================ Kernel to userspace: @@ -196,6 +197,7 @@ Kernel to userspace: ``ETHTOOL_MSG_LINKMODES_GET_REPLY`` link modes info ``ETHTOOL_MSG_LINKMODES_NTF`` link modes notification ``ETHTOOL_MSG_LINKSTATE_GET_REPLY`` link state info + ``ETHTOOL_MSG_DEBUG_GET_REPLY`` debugging settings ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device @@ -423,6 +425,36 @@ define their own handler. devices supporting the request). +DEBUG_GET +========= + +Requests debugging settings of a device. At the moment, only message mask is +provided. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_DEBUG_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_DEBUG_HEADER`` nested reply header + ``ETHTOOL_A_DEBUG_MSGMASK`` bitset message mask + ==================================== ====== ========================== + +The message mask (``ETHTOOL_A_DEBUG_MSGMASK``) is equal to message level as +provided by ``ETHTOOL_GMSGLVL`` and set by ``ETHTOOL_SMSGLVL`` in ioctl +interface. While it is called message level there for historical reasons, most +drivers and almost all newer drivers use it as a mask of enabled message +classes (represented by ``NETIF_MSG_*`` constants); therefore netlink +interface follows its actual use in practice. + +``DEBUG_GET`` allows dump requests (kernel returns reply messages for all +devices supporting the request). + + Request translation =================== @@ -441,7 +473,7 @@ have their netlink replacement yet. ``ETHTOOL_GREGS`` n/a ``ETHTOOL_GWOL`` n/a ``ETHTOOL_SWOL`` n/a - ``ETHTOOL_GMSGLVL`` n/a + ``ETHTOOL_GMSGLVL`` ``ETHTOOL_MSG_DEBUG_GET`` ``ETHTOOL_SMSGLVL`` n/a ``ETHTOOL_NWAY_RST`` n/a ``ETHTOOL_GLINK`` ``ETHTOOL_MSG_LINKSTATE_GET`` diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4626188a754b..a9c6b5c61d27 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3913,22 +3913,48 @@ void netif_device_attach(struct net_device *dev); */ enum { - NETIF_MSG_DRV = 0x0001, - NETIF_MSG_PROBE = 0x0002, - NETIF_MSG_LINK = 0x0004, - NETIF_MSG_TIMER = 0x0008, - NETIF_MSG_IFDOWN = 0x0010, - NETIF_MSG_IFUP = 0x0020, - NETIF_MSG_RX_ERR = 0x0040, - NETIF_MSG_TX_ERR = 0x0080, - NETIF_MSG_TX_QUEUED = 0x0100, - NETIF_MSG_INTR = 0x0200, - NETIF_MSG_TX_DONE = 0x0400, - NETIF_MSG_RX_STATUS = 0x0800, - NETIF_MSG_PKTDATA = 0x1000, - NETIF_MSG_HW = 0x2000, - NETIF_MSG_WOL = 0x4000, + NETIF_MSG_DRV_BIT, + NETIF_MSG_PROBE_BIT, + NETIF_MSG_LINK_BIT, + NETIF_MSG_TIMER_BIT, + NETIF_MSG_IFDOWN_BIT, + NETIF_MSG_IFUP_BIT, + NETIF_MSG_RX_ERR_BIT, + NETIF_MSG_TX_ERR_BIT, + NETIF_MSG_TX_QUEUED_BIT, + NETIF_MSG_INTR_BIT, + NETIF_MSG_TX_DONE_BIT, + NETIF_MSG_RX_STATUS_BIT, + NETIF_MSG_PKTDATA_BIT, + NETIF_MSG_HW_BIT, + NETIF_MSG_WOL_BIT, + + /* When you add a new bit above, update netif_msg_class_names array + * in net/ethtool/common.c + */ + NETIF_MSG_CLASS_COUNT, }; +/* Both ethtool_ops interface and internal driver implementation use u32 */ +static_assert(NETIF_MSG_CLASS_COUNT <= 32); + +#define __NETIF_MSG_BIT(bit) ((u32)1 << (bit)) +#define __NETIF_MSG(name) __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT) + +#define NETIF_MSG_DRV __NETIF_MSG(DRV) +#define NETIF_MSG_PROBE __NETIF_MSG(PROBE) +#define NETIF_MSG_LINK __NETIF_MSG(LINK) +#define NETIF_MSG_TIMER __NETIF_MSG(TIMER) +#define NETIF_MSG_IFDOWN __NETIF_MSG(IFDOWN) +#define NETIF_MSG_IFUP __NETIF_MSG(IFUP) +#define NETIF_MSG_RX_ERR __NETIF_MSG(RX_ERR) +#define NETIF_MSG_TX_ERR __NETIF_MSG(TX_ERR) +#define NETIF_MSG_TX_QUEUED __NETIF_MSG(TX_QUEUED) +#define NETIF_MSG_INTR __NETIF_MSG(INTR) +#define NETIF_MSG_TX_DONE __NETIF_MSG(TX_DONE) +#define NETIF_MSG_RX_STATUS __NETIF_MSG(RX_STATUS) +#define NETIF_MSG_PKTDATA __NETIF_MSG(PKTDATA) +#define NETIF_MSG_HW __NETIF_MSG(HW) +#define NETIF_MSG_WOL __NETIF_MSG(WOL) #define netif_msg_drv(p) ((p)->msg_enable & NETIF_MSG_DRV) #define netif_msg_probe(p) ((p)->msg_enable & NETIF_MSG_PROBE) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 116bcbf09c74..456fb2aa0fad 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -594,6 +594,7 @@ struct ethtool_pauseparam { * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS * @ETH_SS_PHY_TUNABLES: PHY tunable names * @ETH_SS_LINK_MODES: link mode names + * @ETH_SS_MSG_CLASSES: debug message class names */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -606,6 +607,7 @@ enum ethtool_stringset { ETH_SS_PHY_STATS, ETH_SS_PHY_TUNABLES, ETH_SS_LINK_MODES, + ETH_SS_MSG_CLASSES, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 02f82f42a889..0d39e04567cb 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -20,6 +20,7 @@ enum { ETHTOOL_MSG_LINKMODES_GET, ETHTOOL_MSG_LINKMODES_SET, ETHTOOL_MSG_LINKSTATE_GET, + ETHTOOL_MSG_DEBUG_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -35,6 +36,7 @@ enum { ETHTOOL_MSG_LINKMODES_GET_REPLY, ETHTOOL_MSG_LINKMODES_NTF, ETHTOOL_MSG_LINKSTATE_GET_REPLY, + ETHTOOL_MSG_DEBUG_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -195,6 +197,18 @@ enum { ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 }; +/* DEBUG */ + +enum { + ETHTOOL_A_DEBUG_UNSPEC, + ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_DEBUG_CNT, + ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 9a1332fb0cc6..c120c820a4f5 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -5,4 +5,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ - linkstate.o + linkstate.o debug.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index c7b8956c3827..aa1183a65a76 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -171,6 +171,25 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); +const char netif_msg_class_names[][ETH_GSTRING_LEN] = { + [NETIF_MSG_DRV_BIT] = "drv", + [NETIF_MSG_PROBE_BIT] = "probe", + [NETIF_MSG_LINK_BIT] = "link", + [NETIF_MSG_TIMER_BIT] = "timer", + [NETIF_MSG_IFDOWN_BIT] = "ifdown", + [NETIF_MSG_IFUP_BIT] = "ifup", + [NETIF_MSG_RX_ERR_BIT] = "rx_err", + [NETIF_MSG_TX_ERR_BIT] = "tx_err", + [NETIF_MSG_TX_QUEUED_BIT] = "tx_queued", + [NETIF_MSG_INTR_BIT] = "intr", + [NETIF_MSG_TX_DONE_BIT] = "tx_done", + [NETIF_MSG_RX_STATUS_BIT] = "rx_status", + [NETIF_MSG_PKTDATA_BIT] = "pktdata", + [NETIF_MSG_HW_BIT] = "hw", + [NETIF_MSG_WOL_BIT] = "wol", +}; +static_assert(ARRAY_SIZE(netif_msg_class_names) == NETIF_MSG_CLASS_COUNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 5c5f7dc90cd4..064c5c3aa990 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -19,6 +19,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; +extern const char netif_msg_class_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c new file mode 100644 index 000000000000..cc121a23be5f --- /dev/null +++ b/net/ethtool/debug.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct debug_req_info { + struct ethnl_req_info base; +}; + +struct debug_reply_data { + struct ethnl_reply_data base; + u32 msg_mask; +}; + +#define DEBUG_REPDATA(__reply_base) \ + container_of(__reply_base, struct debug_reply_data, base) + +static const struct nla_policy +debug_get_policy[ETHTOOL_A_DEBUG_MAX + 1] = { + [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_REJECT }, +}; + +static int debug_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct debug_reply_data *data = DEBUG_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_msglevel) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->msg_mask = dev->ethtool_ops->get_msglevel(dev); + ethnl_ops_complete(dev); + + return 0; +} + +static int debug_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + + return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, + netif_msg_class_names, compact); +} + +static int debug_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + + return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask, + NULL, NETIF_MSG_CLASS_COUNT, + netif_msg_class_names, compact); +} + +const struct ethnl_request_ops ethnl_debug_request_ops = { + .request_cmd = ETHTOOL_MSG_DEBUG_GET, + .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, + .hdr_attr = ETHTOOL_A_DEBUG_HEADER, + .max_attr = ETHTOOL_A_DEBUG_MAX, + .req_info_size = sizeof(struct debug_req_info), + .reply_data_size = sizeof(struct debug_reply_data), + .request_policy = debug_get_policy, + + .prepare_data = debug_prepare_data, + .reply_size = debug_reply_size, + .fill_reply = debug_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0af43bbdb9b2..bdaf583e392b 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -213,6 +213,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, + [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -664,6 +665,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_DEBUG_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index da9d6521a4eb..9bd8ef671501 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -334,6 +334,7 @@ extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; +extern const struct ethnl_request_ops ethnl_debug_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 948d967f1eca..7a45c25355b8 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -50,6 +50,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_LINK_MODE_MASK_NBITS, .strings = link_mode_names, }, + [ETH_SS_MSG_CLASSES] = { + .per_dev = false, + .count = NETIF_MSG_CLASS_COUNT, + .strings = netif_msg_class_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From e54d04e3afead22d8e7d6edaaac487a1205bac39 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 26 Jan 2020 23:11:07 +0100 Subject: ethtool: set message mask with DEBUG_SET request Implement DEBUG_SET netlink request to set debugging settings for a device. At the moment, only message mask corresponding to message level as set by ETHTOOL_SMSGLVL ioctl request can be set. (It is called message level in ioctl interface but almost all drivers interpret it as a bit mask.) Signed-off-by: Michal Kubecek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 20 ++++++++++- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/debug.c | 53 ++++++++++++++++++++++++++++ net/ethtool/netlink.c | 5 +++ net/ethtool/netlink.h | 1 + 5 files changed, 79 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 76a411d3102d..58473c05319f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -186,6 +186,7 @@ Userspace to kernel: ``ETHTOOL_MSG_LINKMODES_SET`` set link modes info ``ETHTOOL_MSG_LINKSTATE_GET`` get link state ``ETHTOOL_MSG_DEBUG_GET`` get debugging settings + ``ETHTOOL_MSG_DEBUG_SET`` set debugging settings ===================================== ================================ Kernel to userspace: @@ -455,6 +456,23 @@ interface follows its actual use in practice. devices supporting the request). +DEBUG_SET +========= + +Set or update debugging settings of a device. At the moment, only message mask +is supported. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_DEBUG_HEADER`` nested request header + ``ETHTOOL_A_DEBUG_MSGMASK`` bitset message mask + ==================================== ====== ========================== + +``ETHTOOL_A_DEBUG_MSGMASK`` bit set allows setting or modifying mask of +enabled debugging message types for the device. + + Request translation =================== @@ -474,7 +492,7 @@ have their netlink replacement yet. ``ETHTOOL_GWOL`` n/a ``ETHTOOL_SWOL`` n/a ``ETHTOOL_GMSGLVL`` ``ETHTOOL_MSG_DEBUG_GET`` - ``ETHTOOL_SMSGLVL`` n/a + ``ETHTOOL_SMSGLVL`` ``ETHTOOL_MSG_DEBUG_SET`` ``ETHTOOL_NWAY_RST`` n/a ``ETHTOOL_GLINK`` ``ETHTOOL_MSG_LINKSTATE_GET`` ``ETHTOOL_GEEPROM`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 0d39e04567cb..8f0b6fd277d5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -21,6 +21,7 @@ enum { ETHTOOL_MSG_LINKMODES_SET, ETHTOOL_MSG_LINKSTATE_GET, ETHTOOL_MSG_DEBUG_GET, + ETHTOOL_MSG_DEBUG_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index cc121a23be5f..6fc3ef8113c0 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -78,3 +78,56 @@ const struct ethnl_request_ops ethnl_debug_request_ops = { .reply_size = debug_reply_size, .fill_reply = debug_fill_reply, }; + +/* DEBUG_SET */ + +static const struct nla_policy +debug_set_policy[ETHTOOL_A_DEBUG_MAX + 1] = { + [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, +}; + +int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_DEBUG_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + u32 msg_mask; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_DEBUG_MAX, debug_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_DEBUG_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + msg_mask = dev->ethtool_ops->get_msglevel(dev); + ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, + tb[ETHTOOL_A_DEBUG_MSGMASK], + netif_msg_class_names, info->extack, &mod); + if (ret < 0 || !mod) + goto out_ops; + + dev->ethtool_ops->set_msglevel(dev, msg_mask); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index bdaf583e392b..bcdc42e0bd14 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -672,6 +672,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_DEBUG_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_debug, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9bd8ef671501..772723e536e8 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -338,5 +338,6 @@ extern const struct ethnl_request_ops ethnl_debug_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 0bda7af39d2ba6ef83bd18e26bcb027f1630004e Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 26 Jan 2020 23:11:10 +0100 Subject: ethtool: add DEBUG_NTF notification Send ETHTOOL_MSG_DEBUG_NTF notification message whenever debugging message mask for a device are modified using ETHTOOL_MSG_DEBUG_SET netlink message or ETHTOOL_SMSGLVL ioctl request. The notification message has the same format as reply to DEBUG_GET request. As with other ethtool notifications, netlink requests only trigger the notification if the mask is actually changed while ioctl request trigger it whenever the request results in calling the ethtool_ops handler. Signed-off-by: Michal Kubecek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 1 + include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/debug.c | 1 + net/ethtool/ioctl.c | 2 ++ net/ethtool/netlink.c | 2 ++ 5 files changed, 7 insertions(+) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 58473c05319f..2263f885d18f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -199,6 +199,7 @@ Kernel to userspace: ``ETHTOOL_MSG_LINKMODES_NTF`` link modes notification ``ETHTOOL_MSG_LINKSTATE_GET_REPLY`` link state info ``ETHTOOL_MSG_DEBUG_GET_REPLY`` debugging settings + ``ETHTOOL_MSG_DEBUG_NTF`` debugging settings notification ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 8f0b6fd277d5..67a06b94bf28 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -38,6 +38,7 @@ enum { ETHTOOL_MSG_LINKMODES_NTF, ETHTOOL_MSG_LINKSTATE_GET_REPLY, ETHTOOL_MSG_DEBUG_GET_REPLY, + ETHTOOL_MSG_DEBUG_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index 6fc3ef8113c0..aaef4843e6ba 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -123,6 +123,7 @@ int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) goto out_ops; dev->ethtool_ops->set_msglevel(dev, msg_mask); + ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); out_ops: ethnl_ops_complete(dev); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 182bffbffa78..46e0b31782fc 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2545,6 +2545,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SMSGLVL: rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); + if (!rc) + ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); break; case ETHTOOL_GEEE: rc = ethtool_get_eee(dev, useraddr); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index bcdc42e0bd14..9a0a6c2f8dbb 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -524,6 +524,7 @@ static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_NTF] = ðnl_linkmodes_request_ops, + [ETHTOOL_MSG_DEBUG_NTF] = ðnl_debug_request_ops, }; /* default notification handler */ @@ -607,6 +608,7 @@ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) -- cgit v1.2.3 From 51ea22b04ea0c210f9ce87b8a600965dbe476bc2 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 26 Jan 2020 23:11:13 +0100 Subject: ethtool: provide WoL settings with WOL_GET request Implement WOL_GET request to get wake-on-lan settings for a device, traditionally available via ETHTOOL_GWOL ioctl request. As part of the implementation, provide symbolic names for wake-on-line modes as ETH_SS_WOL_MODES string set. Signed-off-by: Michal Kubecek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 30 ++++++++- include/uapi/linux/ethtool.h | 4 ++ include/uapi/linux/ethtool_netlink.h | 15 +++++ net/ethtool/Makefile | 2 +- net/ethtool/common.c | 12 ++++ net/ethtool/common.h | 1 + net/ethtool/netlink.c | 9 +++ net/ethtool/netlink.h | 1 + net/ethtool/strset.c | 5 ++ net/ethtool/wol.c | 99 ++++++++++++++++++++++++++++ 10 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/wol.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 2263f885d18f..5fd85e3ea96e 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -187,6 +187,7 @@ Userspace to kernel: ``ETHTOOL_MSG_LINKSTATE_GET`` get link state ``ETHTOOL_MSG_DEBUG_GET`` get debugging settings ``ETHTOOL_MSG_DEBUG_SET`` set debugging settings + ``ETHTOOL_MSG_WOL_GET`` get wake-on-lan settings ===================================== ================================ Kernel to userspace: @@ -200,6 +201,7 @@ Kernel to userspace: ``ETHTOOL_MSG_LINKSTATE_GET_REPLY`` link state info ``ETHTOOL_MSG_DEBUG_GET_REPLY`` debugging settings ``ETHTOOL_MSG_DEBUG_NTF`` debugging settings notification + ``ETHTOOL_MSG_WOL_GET_REPLY`` wake-on-lan settings ===================================== ================================ ``GET`` requests are sent by userspace applications to retrieve device @@ -474,6 +476,32 @@ Request contents: enabled debugging message types for the device. +WOL_GET +======= + +Query device wake-on-lan settings. Unlike most "GET" type requests, +``ETHTOOL_MSG_WOL_GET`` requires (netns) ``CAP_NET_ADMIN`` privileges as it +(potentially) provides SecureOn(tm) password which is confidential. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_WOL_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_WOL_HEADER`` nested reply header + ``ETHTOOL_A_WOL_MODES`` bitset mask of enabled WoL modes + ``ETHTOOL_A_WOL_SOPASS`` binary SecureOn(tm) password + ==================================== ====== ========================== + +In reply, ``ETHTOOL_A_WOL_MODES`` mask consists of modes supported by the +device, value of modes which are enabled. ``ETHTOOL_A_WOL_SOPASS`` is only +included in reply if ``WAKE_MAGICSECURE`` mode is supported. + + Request translation =================== @@ -490,7 +518,7 @@ have their netlink replacement yet. ``ETHTOOL_MSG_LINKMODES_SET`` ``ETHTOOL_GDRVINFO`` n/a ``ETHTOOL_GREGS`` n/a - ``ETHTOOL_GWOL`` n/a + ``ETHTOOL_GWOL`` ``ETHTOOL_MSG_WOL_GET`` ``ETHTOOL_SWOL`` n/a ``ETHTOOL_GMSGLVL`` ``ETHTOOL_MSG_DEBUG_GET`` ``ETHTOOL_SMSGLVL`` ``ETHTOOL_MSG_DEBUG_SET`` diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 456fb2aa0fad..4295ebfa2f91 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -595,6 +595,7 @@ struct ethtool_pauseparam { * @ETH_SS_PHY_TUNABLES: PHY tunable names * @ETH_SS_LINK_MODES: link mode names * @ETH_SS_MSG_CLASSES: debug message class names + * @ETH_SS_WOL_MODES: wake-on-lan modes */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -608,6 +609,7 @@ enum ethtool_stringset { ETH_SS_PHY_TUNABLES, ETH_SS_LINK_MODES, ETH_SS_MSG_CLASSES, + ETH_SS_WOL_MODES, /* add new constants above here */ ETH_SS_COUNT @@ -1695,6 +1697,8 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ #define WAKE_FILTER (1 << 7) +#define WOL_MODE_COUNT 8 + /* L2-L4 network traffic flow types */ #define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ #define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 67a06b94bf28..dcc5c32dc018 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -22,6 +22,7 @@ enum { ETHTOOL_MSG_LINKSTATE_GET, ETHTOOL_MSG_DEBUG_GET, ETHTOOL_MSG_DEBUG_SET, + ETHTOOL_MSG_WOL_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -39,6 +40,7 @@ enum { ETHTOOL_MSG_LINKSTATE_GET_REPLY, ETHTOOL_MSG_DEBUG_GET_REPLY, ETHTOOL_MSG_DEBUG_NTF, + ETHTOOL_MSG_WOL_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -211,6 +213,19 @@ enum { ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 }; +/* WOL */ + +enum { + ETHTOOL_A_WOL_UNSPEC, + ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_WOL_MODES, /* bitset */ + ETHTOOL_A_WOL_SOPASS, /* binary */ + + /* add new constants above here */ + __ETHTOOL_A_WOL_CNT, + ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index c120c820a4f5..424545a4aaec 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -5,4 +5,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ - linkstate.o debug.o + linkstate.o debug.o wol.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index aa1183a65a76..636ec6d5110e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -190,6 +190,18 @@ const char netif_msg_class_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(netif_msg_class_names) == NETIF_MSG_CLASS_COUNT); +const char wol_mode_names[][ETH_GSTRING_LEN] = { + [const_ilog2(WAKE_PHY)] = "phy", + [const_ilog2(WAKE_UCAST)] = "ucast", + [const_ilog2(WAKE_MCAST)] = "mcast", + [const_ilog2(WAKE_BCAST)] = "bcast", + [const_ilog2(WAKE_ARP)] = "arp", + [const_ilog2(WAKE_MAGIC)] = "magic", + [const_ilog2(WAKE_MAGICSECURE)] = "magicsecure", + [const_ilog2(WAKE_FILTER)] = "filter", +}; +static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 064c5c3aa990..40ba74e0b9bb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -20,6 +20,7 @@ extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; extern const char netif_msg_class_names[][ETH_GSTRING_LEN]; +extern const char wol_mode_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 9a0a6c2f8dbb..eeb6d8594e1b 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -214,6 +214,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, + [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -679,6 +680,14 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_debug, }, + { + .cmd = ETHTOOL_MSG_WOL_GET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 772723e536e8..9fcd6f87b396 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -335,6 +335,7 @@ extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; +extern const struct ethnl_request_ops ethnl_wol_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 7a45c25355b8..8e5911887b4c 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -55,6 +55,11 @@ static const struct strset_info info_template[] = { .count = NETIF_MSG_CLASS_COUNT, .strings = netif_msg_class_names, }, + [ETH_SS_WOL_MODES] = { + .per_dev = false, + .count = WOL_MODE_COUNT, + .strings = wol_mode_names, + }, }; struct strset_req_info { diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c new file mode 100644 index 000000000000..7c9a1ef622ce --- /dev/null +++ b/net/ethtool/wol.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct wol_req_info { + struct ethnl_req_info base; +}; + +struct wol_reply_data { + struct ethnl_reply_data base; + struct ethtool_wolinfo wol; + bool show_sopass; +}; + +#define WOL_REPDATA(__reply_base) \ + container_of(__reply_base, struct wol_reply_data, base) + +static const struct nla_policy +wol_get_policy[ETHTOOL_A_WOL_MAX + 1] = { + [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_WOL_MODES] = { .type = NLA_REJECT }, + [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_REJECT }, +}; + +static int wol_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct wol_reply_data *data = WOL_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + dev->ethtool_ops->get_wol(dev, &data->wol); + ethnl_ops_complete(dev); + data->show_sopass = data->wol.supported & WAKE_MAGICSECURE; + + return 0; +} + +static int wol_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct wol_reply_data *data = WOL_REPDATA(reply_base); + int len; + + len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported, + WOL_MODE_COUNT, wol_mode_names, compact); + if (len < 0) + return len; + if (data->show_sopass) + len += nla_total_size(sizeof(data->wol.sopass)); + + return len; +} + +static int wol_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct wol_reply_data *data = WOL_REPDATA(reply_base); + int ret; + + ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts, + &data->wol.supported, WOL_MODE_COUNT, + wol_mode_names, compact); + if (ret < 0) + return ret; + if (data->show_sopass && + nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass), + data->wol.sopass)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_wol_request_ops = { + .request_cmd = ETHTOOL_MSG_WOL_GET, + .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, + .hdr_attr = ETHTOOL_A_WOL_HEADER, + .max_attr = ETHTOOL_A_WOL_MAX, + .req_info_size = sizeof(struct wol_req_info), + .reply_data_size = sizeof(struct wol_reply_data), + .request_policy = wol_get_policy, + + .prepare_data = wol_prepare_data, + .reply_size = wol_reply_size, + .fill_reply = wol_fill_reply, +}; -- cgit v1.2.3 From 8d425b19b305a77cb1ba67b84e7c1ca547de032f Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 26 Jan 2020 23:11:16 +0100 Subject: ethtool: set wake-on-lan settings with WOL_SET request Implement WOL_SET netlink request to set wake-on-lan settings. This is equivalent to ETHTOOL_SWOL ioctl request. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 20 +++++++- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/netlink.c | 5 ++ net/ethtool/netlink.h | 1 + net/ethtool/wol.c | 76 ++++++++++++++++++++++++++++ 5 files changed, 102 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 5fd85e3ea96e..f16f74bbb546 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -188,6 +188,7 @@ Userspace to kernel: ``ETHTOOL_MSG_DEBUG_GET`` get debugging settings ``ETHTOOL_MSG_DEBUG_SET`` set debugging settings ``ETHTOOL_MSG_WOL_GET`` get wake-on-lan settings + ``ETHTOOL_MSG_WOL_SET`` set wake-on-lan settings ===================================== ================================ Kernel to userspace: @@ -502,6 +503,23 @@ device, value of modes which are enabled. ``ETHTOOL_A_WOL_SOPASS`` is only included in reply if ``WAKE_MAGICSECURE`` mode is supported. +WOL_SET +======= + +Set or update wake-on-lan settings. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_WOL_HEADER`` nested request header + ``ETHTOOL_A_WOL_MODES`` bitset enabled WoL modes + ``ETHTOOL_A_WOL_SOPASS`` binary SecureOn(tm) password + ==================================== ====== ========================== + +``ETHTOOL_A_WOL_SOPASS`` is only allowed for devices supporting +``WAKE_MAGICSECURE`` mode. + + Request translation =================== @@ -519,7 +537,7 @@ have their netlink replacement yet. ``ETHTOOL_GDRVINFO`` n/a ``ETHTOOL_GREGS`` n/a ``ETHTOOL_GWOL`` ``ETHTOOL_MSG_WOL_GET`` - ``ETHTOOL_SWOL`` n/a + ``ETHTOOL_SWOL`` ``ETHTOOL_MSG_WOL_SET`` ``ETHTOOL_GMSGLVL`` ``ETHTOOL_MSG_DEBUG_GET`` ``ETHTOOL_SMSGLVL`` ``ETHTOOL_MSG_DEBUG_SET`` ``ETHTOOL_NWAY_RST`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index dcc5c32dc018..59de35695521 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -23,6 +23,7 @@ enum { ETHTOOL_MSG_DEBUG_GET, ETHTOOL_MSG_DEBUG_SET, ETHTOOL_MSG_WOL_GET, + ETHTOOL_MSG_WOL_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index eeb6d8594e1b..2c375f9095fe 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -688,6 +688,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_WOL_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_wol, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9fcd6f87b396..60efd87686ad 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -340,5 +340,6 @@ extern const struct ethnl_request_ops ethnl_wol_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index 7c9a1ef622ce..a2724378fac4 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -97,3 +97,79 @@ const struct ethnl_request_ops ethnl_wol_request_ops = { .reply_size = wol_reply_size, .fill_reply = wol_fill_reply, }; + +/* WOL_SET */ + +static const struct nla_policy +wol_set_policy[ETHTOOL_A_WOL_MAX + 1] = { + [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED }, + [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY, + .len = SOPASS_MAX }, +}; + +int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) +{ + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + struct nlattr *tb[ETHTOOL_A_WOL_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_WOL_MAX, + wol_set_policy, info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_WOL_HEADER], + genl_info_net(info), info->extack, true); + if (ret < 0) + return ret; + dev = req_info.dev; + if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + dev->ethtool_ops->get_wol(dev, &wol); + ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, + tb[ETHTOOL_A_WOL_MODES], wol_mode_names, + info->extack, &mod); + if (ret < 0) + goto out_ops; + if (wol.wolopts & ~wol.supported) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], + "cannot enable unsupported WoL mode"); + ret = -EINVAL; + goto out_ops; + } + if (tb[ETHTOOL_A_WOL_SOPASS]) { + if (!(wol.supported & WAKE_MAGICSECURE)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_WOL_SOPASS], + "magicsecure not supported, cannot set password"); + ret = -EINVAL; + goto out_ops; + } + ethnl_update_binary(wol.sopass, sizeof(wol.sopass), + tb[ETHTOOL_A_WOL_SOPASS], &mod); + } + + if (!mod) + goto out_ops; + ret = dev->ethtool_ops->set_wol(dev, &wol); + if (ret) + goto out_ops; + dev->wol_enabled = !!wol.wolopts; + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} -- cgit v1.2.3 From 67bffa79231f15ba372007972563cc8aa5e48bfa Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sun, 26 Jan 2020 23:11:19 +0100 Subject: ethtool: add WOL_NTF notification Send ETHTOOL_MSG_WOL_NTF notification whenever wake-on-lan settings of a device are modified using ETHTOOL_MSG_WOL_SET netlink message or ETHTOOL_SWOL ioctl request. As notifications can be received by anyone, do not include SecureOn(tm) password in notification messages. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 5 +++-- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/ioctl.c | 1 + net/ethtool/netlink.c | 2 ++ net/ethtool/wol.c | 4 +++- 5 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index f16f74bbb546..f1f868479ceb 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -193,7 +193,7 @@ Userspace to kernel: Kernel to userspace: - ===================================== ================================ + ===================================== ================================= ``ETHTOOL_MSG_STRSET_GET_REPLY`` string set contents ``ETHTOOL_MSG_LINKINFO_GET_REPLY`` link settings ``ETHTOOL_MSG_LINKINFO_NTF`` link settings notification @@ -203,7 +203,8 @@ Kernel to userspace: ``ETHTOOL_MSG_DEBUG_GET_REPLY`` debugging settings ``ETHTOOL_MSG_DEBUG_NTF`` debugging settings notification ``ETHTOOL_MSG_WOL_GET_REPLY`` wake-on-lan settings - ===================================== ================================ + ``ETHTOOL_MSG_WOL_NTF`` wake-on-lan settings notification + ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device information. They usually do not contain any message specific attributes. diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 59de35695521..7e0b460f872c 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -42,6 +42,7 @@ enum { ETHTOOL_MSG_DEBUG_GET_REPLY, ETHTOOL_MSG_DEBUG_NTF, ETHTOOL_MSG_WOL_GET_REPLY, + ETHTOOL_MSG_WOL_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 46e0b31782fc..b88dd14e41c6 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1316,6 +1316,7 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return ret; dev->wol_enabled = !!wol.wolopts; + ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); return 0; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 2c375f9095fe..180c194fab07 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -526,6 +526,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_NTF] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_DEBUG_NTF] = ðnl_debug_request_ops, + [ETHTOOL_MSG_WOL_NTF] = ðnl_wol_request_ops, }; /* default notification handler */ @@ -610,6 +611,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index a2724378fac4..e1b8a65b64c4 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -41,7 +41,8 @@ static int wol_prepare_data(const struct ethnl_req_info *req_base, return ret; dev->ethtool_ops->get_wol(dev, &data->wol); ethnl_ops_complete(dev); - data->show_sopass = data->wol.supported & WAKE_MAGICSECURE; + /* do not include password in notifications */ + data->show_sopass = info && (data->wol.supported & WAKE_MAGICSECURE); return 0; } @@ -165,6 +166,7 @@ int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) if (ret) goto out_ops; dev->wol_enabled = !!wol.wolopts; + ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); out_ops: ethnl_ops_complete(dev); -- cgit v1.2.3 From 6a7e25c7fb482dba3e80fec953f1907bcb24d52c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Jan 2020 09:20:28 +0200 Subject: net/core: Replace driver version to be kernel version In order to stop useless driver version bumps and unify output presented by ethtool -i, let's set default version string. As Linus said in [1]: "Things are supposed to be backwards and forwards compatible, because we don't accept breakage in user space anyway. So versioning is pointless, and only causes problems." They cause problems when users start to see version changes and expect specific set of features which will be different for stable@, vanilla and distribution kernels. Distribution kernels are based on some kernel version with extra patches on top, for example, in RedHat world this "extra" is a lot and for them your driver version say nothing. Users who run vanilla kernels won't use driver version information too, because running such kernels requires knowledge and understanding. Another set of problems are related to difference in versioning scheme and such doesn't allow to write meaningful automation which will work sanely on all ethtool capable devices. Before this change: [leonro@erver ~]$ ethtool -i eth0 driver: virtio_net version: 1.0.0 After this change and once ->version assignment will be deleted from virtio_net: [leonro@server ~]$ ethtool -i eth0 driver: virtio_net version: 5.5.0-rc6+ Link: https://lore.kernel.org/ksummit-discuss/CA+55aFx9A=5cc0QZ7CySC4F2K7eYaEfzkdYEc9JaNgCcV25=rg@mail.gmail.com/ Link: https://lore.kernel.org/linux-rdma/20200122152627.14903-1-michal.kalderon@marvell.com/T/#md460ff8f976c532a89d6860411c3c50bb811038b Link: https://lore.kernel.org/linux-rdma/20200127060835.GA570@unicorn.suse.cz Signed-off-by: Leon Romanovsky Acked-by: Shannon Nelson Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index b88dd14e41c6..b987052d91ef 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -655,6 +656,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; + strlcpy(info.version, UTS_RELEASE, sizeof(info.version)); if (ops->get_drvinfo) { ops->get_drvinfo(dev, &info); } else if (dev->dev.parent && dev->dev.parent->driver) { -- cgit v1.2.3 From c312840cd79061af37158cb42590931cfa364c1b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Jan 2020 13:49:33 +0100 Subject: Revert "pktgen: Allow configuration of IPv6 source address range" This reverts commit 7786a1af2a6bceb07860ec720e74714004438834. It causes build failures on 32-bit, for example: net/core/pktgen.o: In function `mod_cur_headers': >> pktgen.c:(.text.mod_cur_headers+0xba0): undefined reference to `__umoddi3' Signed-off-by: David S. Miller --- net/core/pktgen.c | 98 ------------------------------------------------------- 1 file changed, 98 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 890be1b4877e..294bfcf0ce0e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -323,10 +323,6 @@ struct pktgen_dev { struct in6_addr max_in6_daddr; struct in6_addr min_in6_saddr; struct in6_addr max_in6_saddr; - u64 max_in6_h; - u64 max_in6_l; - u64 min_in6_h; - u64 min_in6_l; /* If we're doing ranges, random or incremental, then this * defines the min/max for those ranges. @@ -1359,59 +1355,6 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } - if (!strcmp(name, "src6_min")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); - if (len < 0) - return len; - - pkt_dev->flags |= F_IPV6; - - if (copy_from_user(buf, &user_buffer[i], len)) - return -EFAULT; - buf[len] = 0; - - in6_pton(buf, -1, pkt_dev->min_in6_saddr.s6_addr, -1, NULL); - snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_saddr); - - memcpy(&pkt_dev->min_in6_h, pkt_dev->min_in6_saddr.s6_addr, 8); - memcpy(&pkt_dev->min_in6_l, pkt_dev->min_in6_saddr.s6_addr + 8, 8); - pkt_dev->min_in6_h = be64_to_cpu(pkt_dev->min_in6_h); - pkt_dev->min_in6_l = be64_to_cpu(pkt_dev->min_in6_l); - - pkt_dev->cur_in6_saddr = pkt_dev->min_in6_saddr; - if (debug) - pr_debug("src6_min set to: %s\n", buf); - - i += len; - sprintf(pg_result, "OK: src6_min=%s", buf); - return count; - } - if (!strcmp(name, "src6_max")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); - if (len < 0) - return len; - - pkt_dev->flags |= F_IPV6; - - if (copy_from_user(buf, &user_buffer[i], len)) - return -EFAULT; - buf[len] = 0; - - in6_pton(buf, -1, pkt_dev->max_in6_saddr.s6_addr, -1, NULL); - snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_saddr); - - memcpy(&pkt_dev->max_in6_h, pkt_dev->max_in6_saddr.s6_addr, 8); - memcpy(&pkt_dev->max_in6_l, pkt_dev->max_in6_saddr.s6_addr + 8, 8); - pkt_dev->max_in6_h = be64_to_cpu(pkt_dev->max_in6_h); - pkt_dev->max_in6_l = be64_to_cpu(pkt_dev->max_in6_l); - - if (debug) - pr_debug("src6_max set to: %s\n", buf); - - i += len; - sprintf(pg_result, "OK: src6_max=%s", buf); - return count; - } if (!strcmp(name, "src6")) { len = strn_len(&user_buffer[i], sizeof(buf) - 1); if (len < 0) @@ -2343,45 +2286,6 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues; } -/* generate ipv6 source addr */ -static void set_src_in6_addr(struct pktgen_dev *pkt_dev) -{ - u64 min6, max6, rand, i; - struct in6_addr addr6; - __be64 addr_l, *t; - - min6 = pkt_dev->min_in6_l; - max6 = pkt_dev->max_in6_l; - - /* only generate source address in least significant 64 bits range - * most significant 64 bits must be equal - */ - if (pkt_dev->max_in6_h != pkt_dev->min_in6_h || min6 >= max6) - return; - - addr6 = pkt_dev->min_in6_saddr; - t = (__be64 *)addr6.s6_addr + 1; - - if (pkt_dev->flags & F_IPSRC_RND) { - do { - prandom_bytes(&rand, sizeof(rand)); - rand = rand % (max6 - min6) + min6; - addr_l = cpu_to_be64(rand); - memcpy(t, &addr_l, 8); - } while (ipv6_addr_loopback(&addr6) || - ipv6_addr_v4mapped(&addr6) || - ipv6_addr_is_multicast(&addr6)); - } else { - addr6 = pkt_dev->cur_in6_saddr; - i = be64_to_cpu(*t); - if (++i > max6) - i = min6; - addr_l = cpu_to_be64(i); - memcpy(t, &addr_l, 8); - } - pkt_dev->cur_in6_saddr = addr6; -} - /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2550,8 +2454,6 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) } } else { /* IPV6 * */ - set_src_in6_addr(pkt_dev); - if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) { int i; -- cgit v1.2.3