From eb7935830d00b9e0c4ca11382143ea2320eb45c2 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 12 Dec 2017 16:02:50 +0200 Subject: net: bridge: use rhashtable for fdbs Before this patch the bridge used a fixed 256 element hash table which was fine for small use cases (in my tests it starts to degrade above 1000 entries), but it wasn't enough for medium or large scale deployments. Modern setups have thousands of participants in a single bridge, even only enabling vlans and adding a few thousand vlan entries will cause a few thousand fdbs to be automatically inserted per participating port. So we need to scale the fdb table considerably to cope with modern workloads, and this patch converts it to use a rhashtable for its operations thus improving the bridge scalability. Tests show the following results (10 runs each), at up to 1000 entries rhashtable is ~3% slower, at 2000 rhashtable is 30% faster, at 3000 it is 2 times faster and at 30000 it is 50 times faster. Obviously this happens because of the properties of the two constructs and is expected, rhashtable keeps pretty much a constant time even with 10000000 entries (tested), while the fixed hash table struggles considerably even above 10000. As a side effect this also reduces the net_bridge struct size from 3248 bytes to 1344 bytes. Also note that the key struct is 8 bytes. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/trace/events/bridge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h index 1bee3e7fdf32..8ea966448b58 100644 --- a/include/trace/events/bridge.h +++ b/include/trace/events/bridge.h @@ -82,8 +82,8 @@ TRACE_EVENT(fdb_delete, TP_fast_assign( __assign_str(br_dev, br->dev->name); __assign_str(dev, f->dst ? f->dst->dev->name : "null"); - memcpy(__entry->addr, f->addr.addr, ETH_ALEN); - __entry->vid = f->vlan_id; + memcpy(__entry->addr, f->key.addr.addr, ETH_ALEN); + __entry->vid = f->key.vlan_id; ), TP_printk("br_dev %s dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u", -- cgit v1.2.3 From d7b850a7dedf163fbfdb7aa20226d8aff2836338 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 20 Dec 2017 11:12:50 +0800 Subject: tcp: Export to userspace the TCP state names for the trace events The TCP trace events (specifically tcp_set_state), maps emums to symbol names via __print_symbolic(). But this only works for reading trace events from the tracefs trace files. If perf or trace-cmd were to record these events, the event format file does not convert the enum names into numbers, and you get something like: __print_symbolic(REC->oldstate, { TCP_ESTABLISHED, "TCP_ESTABLISHED" }, { TCP_SYN_SENT, "TCP_SYN_SENT" }, { TCP_SYN_RECV, "TCP_SYN_RECV" }, { TCP_FIN_WAIT1, "TCP_FIN_WAIT1" }, { TCP_FIN_WAIT2, "TCP_FIN_WAIT2" }, { TCP_TIME_WAIT, "TCP_TIME_WAIT" }, { TCP_CLOSE, "TCP_CLOSE" }, { TCP_CLOSE_WAIT, "TCP_CLOSE_WAIT" }, { TCP_LAST_ACK, "TCP_LAST_ACK" }, { TCP_LISTEN, "TCP_LISTEN" }, { TCP_CLOSING, "TCP_CLOSING" }, { TCP_NEW_SYN_RECV, "TCP_NEW_SYN_RECV" }) Where trace-cmd and perf do not know the values of those enums. Use the TRACE_DEFINE_ENUM() macros that will have the trace events convert the enum strings into their values at system boot. This will allow perf and trace-cmd to see actual numbers and not enums: __print_symbolic(REC->oldstate, { 1, "TCP_ESTABLISHED" }, { 2, "TCP_SYN_SENT" }, { 3, "TCP_SYN_RECV" }, { 4, "TCP_FIN_WAIT1" }, { 5, "TCP_FIN_WAIT2" }, { 6, "TCP_TIME_WAIT" }, { 7, "TCP_CLOSE" }, { 8, "TCP_CLOSE_WAIT" }, { 9, "TCP_LAST_ACK" }, { 10, "TCP_LISTEN" }, { 11, "TCP_CLOSING" }, { 12, "TCP_NEW_SYN_RECV" }) Signed-off-by: Steven Rostedt (VMware) Acked-by: Song Liu Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 07cccca6cbf1..ec52fb3b4ae0 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,21 +9,36 @@ #include #include +#define tcp_state_names \ + EM(TCP_ESTABLISHED) \ + EM(TCP_SYN_SENT) \ + EM(TCP_SYN_RECV) \ + EM(TCP_FIN_WAIT1) \ + EM(TCP_FIN_WAIT2) \ + EM(TCP_TIME_WAIT) \ + EM(TCP_CLOSE) \ + EM(TCP_CLOSE_WAIT) \ + EM(TCP_LAST_ACK) \ + EM(TCP_LISTEN) \ + EM(TCP_CLOSING) \ + EMe(TCP_NEW_SYN_RECV) \ + +/* enums need to be exported to user space */ +#undef EM +#undef EMe +#define EM(a) TRACE_DEFINE_ENUM(a); +#define EMe(a) TRACE_DEFINE_ENUM(a); + +tcp_state_names + +#undef EM +#undef EMe +#define EM(a) tcp_state_name(a), +#define EMe(a) tcp_state_name(a) + #define tcp_state_name(state) { state, #state } #define show_tcp_state_name(val) \ - __print_symbolic(val, \ - tcp_state_name(TCP_ESTABLISHED), \ - tcp_state_name(TCP_SYN_SENT), \ - tcp_state_name(TCP_SYN_RECV), \ - tcp_state_name(TCP_FIN_WAIT1), \ - tcp_state_name(TCP_FIN_WAIT2), \ - tcp_state_name(TCP_TIME_WAIT), \ - tcp_state_name(TCP_CLOSE), \ - tcp_state_name(TCP_CLOSE_WAIT), \ - tcp_state_name(TCP_LAST_ACK), \ - tcp_state_name(TCP_LISTEN), \ - tcp_state_name(TCP_CLOSING), \ - tcp_state_name(TCP_NEW_SYN_RECV)) + __print_symbolic(val, tcp_state_names) /* * tcp event with arguments sk and skb -- cgit v1.2.3 From 563e0bb0dc74b3ca888e24f8c08f0239fe4016b0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 20 Dec 2017 11:12:51 +0800 Subject: net: tracepoint: replace tcp_set_state tracepoint with inet_sock_set_state tracepoint As sk_state is a common field for struct sock, so the state transition tracepoint should not be a TCP specific feature. Currently it traces all AF_INET state transition, so I rename this tracepoint to inet_sock_set_state tracepoint with some minor changes and move it into trace/events/sock.h. We dont need to create a file named trace/events/inet_sock.h for this one single tracepoint. Two helpers are introduced to trace sk_state transition - void inet_sk_state_store(struct sock *sk, int newstate); - void inet_sk_set_state(struct sock *sk, int state); As trace header should not be included in other header files, so they are defined in sock.c. The protocol such as SCTP maybe compiled as a ko, hence export inet_sk_set_state(). Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 + include/trace/events/sock.h | 107 ++++++++++++++++++++++++++++++++++++++++ include/trace/events/tcp.h | 31 ------------ net/ipv4/af_inet.c | 14 ++++++ net/ipv4/inet_connection_sock.c | 6 +-- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/tcp.c | 6 +-- 7 files changed, 128 insertions(+), 40 deletions(-) (limited to 'include/trace') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 39efb968b7a4..a3431a4ff9cc 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -290,6 +290,8 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #endif int inet_sk_rebuild_header(struct sock *sk); +void inet_sk_set_state(struct sock *sk, int state); +void inet_sk_state_store(struct sock *sk, int newstate); static inline unsigned int __inet_ehashfn(const __be32 laddr, const __u16 lport, diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index ec4dade24466..3b9094a07b80 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -6,7 +6,50 @@ #define _TRACE_SOCK_H #include +#include #include +#include +#include + +/* The protocol traced by sock_set_state */ +#define inet_protocol_names \ + EM(IPPROTO_TCP) \ + EM(IPPROTO_DCCP) \ + EMe(IPPROTO_SCTP) + +#define tcp_state_names \ + EM(TCP_ESTABLISHED) \ + EM(TCP_SYN_SENT) \ + EM(TCP_SYN_RECV) \ + EM(TCP_FIN_WAIT1) \ + EM(TCP_FIN_WAIT2) \ + EM(TCP_TIME_WAIT) \ + EM(TCP_CLOSE) \ + EM(TCP_CLOSE_WAIT) \ + EM(TCP_LAST_ACK) \ + EM(TCP_LISTEN) \ + EM(TCP_CLOSING) \ + EMe(TCP_NEW_SYN_RECV) + +/* enums need to be exported to user space */ +#undef EM +#undef EMe +#define EM(a) TRACE_DEFINE_ENUM(a); +#define EMe(a) TRACE_DEFINE_ENUM(a); + +inet_protocol_names +tcp_state_names + +#undef EM +#undef EMe +#define EM(a) { a, #a }, +#define EMe(a) { a, #a } + +#define show_inet_protocol_name(val) \ + __print_symbolic(val, inet_protocol_names) + +#define show_tcp_state_name(val) \ + __print_symbolic(val, tcp_state_names) TRACE_EVENT(sock_rcvqueue_full, @@ -63,6 +106,70 @@ TRACE_EVENT(sock_exceed_buf_limit, __entry->rmem_alloc) ); +TRACE_EVENT(inet_sock_set_state, + + TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), + + TP_ARGS(sk, oldstate, newstate), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(int, oldstate) + __field(int, newstate) + __field(__u16, sport) + __field(__u16, dport) + __field(__u8, protocol) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + __entry->oldstate = oldstate; + __entry->newstate = newstate; + + __entry->protocol = sk->sk_protocol; + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4" + "saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", + show_inet_protocol_name(__entry->protocol), + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6, + show_tcp_state_name(__entry->oldstate), + show_tcp_state_name(__entry->newstate)) +); + #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index ec52fb3b4ae0..8e88a1671538 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,37 +9,6 @@ #include #include -#define tcp_state_names \ - EM(TCP_ESTABLISHED) \ - EM(TCP_SYN_SENT) \ - EM(TCP_SYN_RECV) \ - EM(TCP_FIN_WAIT1) \ - EM(TCP_FIN_WAIT2) \ - EM(TCP_TIME_WAIT) \ - EM(TCP_CLOSE) \ - EM(TCP_CLOSE_WAIT) \ - EM(TCP_LAST_ACK) \ - EM(TCP_LISTEN) \ - EM(TCP_CLOSING) \ - EMe(TCP_NEW_SYN_RECV) \ - -/* enums need to be exported to user space */ -#undef EM -#undef EMe -#define EM(a) TRACE_DEFINE_ENUM(a); -#define EMe(a) TRACE_DEFINE_ENUM(a); - -tcp_state_names - -#undef EM -#undef EMe -#define EM(a) tcp_state_name(a), -#define EMe(a) tcp_state_name(a) - -#define tcp_state_name(state) { state, #state } -#define show_tcp_state_name(val) \ - __print_symbolic(val, tcp_state_names) - /* * tcp event with arguments sk and skb * diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f00499a46927..bab98a4fedad 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -121,6 +121,7 @@ #endif #include +#include /* The inetsw table contains everything that inet_create needs to * build a new socket. @@ -1220,6 +1221,19 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); +void inet_sk_set_state(struct sock *sk, int state) +{ + trace_inet_sock_set_state(sk, sk->sk_state, state); + sk->sk_state = state; +} +EXPORT_SYMBOL(inet_sk_set_state); + +void inet_sk_state_store(struct sock *sk, int newstate) +{ + trace_inet_sock_set_state(sk, sk->sk_state, newstate); + smp_store_release(&sk->sk_state, newstate); +} + struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4ca46dc08e63..f460fc04aa66 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -783,7 +783,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_state = TCP_SYN_RECV; + inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; @@ -877,7 +877,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk_state_store(sk, TCP_LISTEN); + inet_sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -888,7 +888,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) return 0; } - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index f6f58108b4c5..37b7da0b975d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -544,7 +544,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { percpu_counter_inc(sk->sk_prot->orphan_count); - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c470fec9062f..d408fb41c804 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -283,8 +283,6 @@ #include #include -#include - struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -2040,8 +2038,6 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; - trace_tcp_set_state(sk, oldstate, state); - switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) @@ -2065,7 +2061,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk_state_store(sk, state); + inet_sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -- cgit v1.2.3 From 4f36b935ec83e592fd5e866f382c0133b3cffc60 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 24 Dec 2017 23:10:39 +0800 Subject: net/trace: fix printk format in inet_sock_set_state There's a space character missed in the printk messages. Put the message into one line could simplify searching for the messages in the kernel source. Fixes: 563e0bb0dc74("net: tracepoint: replace tcp_set_state tracepoint with inet_sock_set_state tracepoint") Cc: Sergei Shtylyov Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/trace/events/sock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3b9094a07b80..3537c5f34d77 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -160,8 +160,7 @@ TRACE_EVENT(inet_sock_set_state, } ), - TP_printk("protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4" - "saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", + TP_printk("protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, -- cgit v1.2.3 From c3fde1bd28f7c720d7bc587e85e54706df4f8163 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 29 Dec 2017 11:45:51 +0900 Subject: net: tcp: Add trace events for TCP congestion window tracing This adds an event to trace TCP stat variables with slightly intrusive trace-event. This uses ftrace/perf event log buffer to trace those state, no needs to prepare own ring-buffer, nor custom user apps. User can use ftrace to trace this event as below; # cd /sys/kernel/debug/tracing # echo 1 > events/tcp/tcp_probe/enable (run workloads) # cat trace Signed-off-by: Masami Hiramatsu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 3 ++ 2 files changed, 100 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index bb00459d2d4d..b5ae3fbb74c8 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp @@ -8,6 +9,7 @@ #include #include #include +#include #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ @@ -254,6 +256,101 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); + +#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk) \ + do { \ + struct sockaddr_in *v4 = (void *)__entry->saddr; \ + \ + v4->sin_family = AF_INET; \ + v4->sin_port = inet->inet_sport; \ + v4->sin_addr.s_addr = inet->inet_saddr; \ + v4 = (void *)__entry->daddr; \ + v4->sin_family = AF_INET; \ + v4->sin_port = inet->inet_dport; \ + v4->sin_addr.s_addr = inet->inet_daddr; \ + } while (0) + +#if IS_ENABLED(CONFIG_IPV6) + +#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ + do { \ + if (sk->sk_family == AF_INET6) { \ + struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ + \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = inet->inet_sport; \ + v6->sin6_addr = inet6_sk(sk)->saddr; \ + v6 = (void *)__entry->daddr; \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = inet->inet_dport; \ + v6->sin6_addr = sk->sk_v6_daddr; \ + } else \ + TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); \ + } while (0) + +#else + +#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ + TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); + +#endif + +TRACE_EVENT(tcp_probe, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb), + + TP_STRUCT__entry( + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(__u16, sport) + __field(__u16, dport) + __field(__u32, mark) + __field(__u16, length) + __field(__u32, snd_nxt) + __field(__u32, snd_una) + __field(__u32, snd_cwnd) + __field(__u32, ssthresh) + __field(__u32, snd_wnd) + __field(__u32, srtt) + __field(__u32, rcv_wnd) + ), + + TP_fast_assign( + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + /* For filtering use */ + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + __entry->mark = skb->mark; + + __entry->length = skb->len; + __entry->snd_nxt = tp->snd_nxt; + __entry->snd_una = tp->snd_una; + __entry->snd_cwnd = tp->snd_cwnd; + __entry->snd_wnd = tp->snd_wnd; + __entry->rcv_wnd = tp->rcv_wnd; + __entry->ssthresh = tcp_current_ssthresh(sk); + __entry->srtt = tp->srtt_us >> 3; + ), + + TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x " + "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u " + "rcv_wnd=%u", + __entry->saddr, __entry->daddr, __entry->mark, + __entry->length, __entry->snd_nxt, __entry->snd_una, + __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, + __entry->srtt, __entry->rcv_wnd) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4d55c4b338ee..ff71b18d9682 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5299,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); + /* TCP congestion window tracking */ + trace_tcp_probe(sk, skb); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); -- cgit v1.2.3 From 103d750c88fe6b42dbe7abc4d204027f343ee125 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 29 Dec 2017 11:46:51 +0900 Subject: net: sctp: Add SCTP ACK tracking trace event Add SCTP ACK tracking trace event to trace the changes of SCTP association state in response to incoming packets. It is used for debugging SCTP congestion control algorithms, and will replace sctp_probe module. Note that this event a bit tricky. Since this consists of 2 events (sctp_probe and sctp_probe_path) so you have to enable both events as below. # cd /sys/kernel/debug/tracing # echo 1 > events/sctp/sctp_probe/enable # echo 1 > events/sctp/sctp_probe_path/enable Or, you can enable all the events under sctp. # echo 1 > events/sctp/enable Since sctp_probe_path event is always invoked from sctp_probe event, you can not see any output if you only enable sctp_probe_path. Signed-off-by: Masami Hiramatsu Signed-off-by: David S. Miller --- include/trace/events/sctp.h | 99 +++++++++++++++++++++++++++++++++++++++++++++ net/sctp/sm_statefuns.c | 5 +++ 2 files changed, 104 insertions(+) create mode 100644 include/trace/events/sctp.h (limited to 'include/trace') diff --git a/include/trace/events/sctp.h b/include/trace/events/sctp.h new file mode 100644 index 000000000000..7475c7be165a --- /dev/null +++ b/include/trace/events/sctp.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sctp + +#if !defined(_TRACE_SCTP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SCTP_H + +#include +#include + +TRACE_EVENT(sctp_probe_path, + + TP_PROTO(struct sctp_transport *sp, + const struct sctp_association *asoc), + + TP_ARGS(sp, asoc), + + TP_STRUCT__entry( + __field(__u64, asoc) + __field(__u32, primary) + __array(__u8, ipaddr, sizeof(union sctp_addr)) + __field(__u32, state) + __field(__u32, cwnd) + __field(__u32, ssthresh) + __field(__u32, flight_size) + __field(__u32, partial_bytes_acked) + __field(__u32, pathmtu) + ), + + TP_fast_assign( + __entry->asoc = (unsigned long)asoc; + __entry->primary = (sp == asoc->peer.primary_path); + memcpy(__entry->ipaddr, &sp->ipaddr, sizeof(union sctp_addr)); + __entry->state = sp->state; + __entry->cwnd = sp->cwnd; + __entry->ssthresh = sp->ssthresh; + __entry->flight_size = sp->flight_size; + __entry->partial_bytes_acked = sp->partial_bytes_acked; + __entry->pathmtu = sp->pathmtu; + ), + + TP_printk("asoc=%#llx%s ipaddr=%pISpc state=%u cwnd=%u ssthresh=%u " + "flight_size=%u partial_bytes_acked=%u pathmtu=%u", + __entry->asoc, __entry->primary ? "(*)" : "", + __entry->ipaddr, __entry->state, __entry->cwnd, + __entry->ssthresh, __entry->flight_size, + __entry->partial_bytes_acked, __entry->pathmtu) +); + +TRACE_EVENT(sctp_probe, + + TP_PROTO(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk), + + TP_ARGS(ep, asoc, chunk), + + TP_STRUCT__entry( + __field(__u64, asoc) + __field(__u32, mark) + __field(__u16, bind_port) + __field(__u16, peer_port) + __field(__u32, pathmtu) + __field(__u32, rwnd) + __field(__u16, unack_data) + ), + + TP_fast_assign( + struct sk_buff *skb = chunk->skb; + + __entry->asoc = (unsigned long)asoc; + __entry->mark = skb->mark; + __entry->bind_port = ep->base.bind_addr.port; + __entry->peer_port = asoc->peer.port; + __entry->pathmtu = asoc->pathmtu; + __entry->rwnd = asoc->peer.rwnd; + __entry->unack_data = asoc->unack_data; + + if (trace_sctp_probe_path_enabled()) { + struct sctp_transport *sp; + + list_for_each_entry(sp, &asoc->peer.transport_addr_list, + transports) { + trace_sctp_probe_path(sp, asoc); + } + } + ), + + TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d " + "rwnd=%u unack_data=%d", + __entry->asoc, __entry->mark, __entry->bind_port, + __entry->peer_port, __entry->pathmtu, __entry->rwnd, + __entry->unack_data) +); + +#endif /* _TRACE_SCTP_H */ + +/* This part must be outside protection */ +#include diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 541f34735346..eb7905ffe5f2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -59,6 +59,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + static struct sctp_packet *sctp_abort_pkt_new( struct net *net, const struct sctp_endpoint *ep, @@ -3219,6 +3222,8 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, struct sctp_sackhdr *sackh; __u32 ctsn; + trace_sctp_probe(ep, asoc, chunk); + if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); -- cgit v1.2.3 From ee549be6f061188f306133e3a66ce3d3c6758811 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 29 Dec 2017 11:47:55 +0900 Subject: net: dccp: Add DCCP sendmsg trace event Add DCCP sendmsg trace event (dccp/dccp_probe) for replacing dccpprobe. User can trace this event via ftrace or perftools. Signed-off-by: Masami Hiramatsu Signed-off-by: David S. Miller --- include/trace/events/net_probe_common.h | 44 +++++++++++++++++ include/trace/events/tcp.h | 39 +-------------- net/dccp/Makefile | 3 ++ net/dccp/proto.c | 5 ++ net/dccp/trace.h | 84 +++++++++++++++++++++++++++++++++ 5 files changed, 137 insertions(+), 38 deletions(-) create mode 100644 include/trace/events/net_probe_common.h create mode 100644 net/dccp/trace.h (limited to 'include/trace') diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h new file mode 100644 index 000000000000..3930119cab08 --- /dev/null +++ b/include/trace/events/net_probe_common.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if !defined(_TRACE_NET_PROBE_COMMON_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NET_PROBE_COMMON_H + +#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk) \ + do { \ + struct sockaddr_in *v4 = (void *)__entry->saddr; \ + \ + v4->sin_family = AF_INET; \ + v4->sin_port = inet->inet_sport; \ + v4->sin_addr.s_addr = inet->inet_saddr; \ + v4 = (void *)__entry->daddr; \ + v4->sin_family = AF_INET; \ + v4->sin_port = inet->inet_dport; \ + v4->sin_addr.s_addr = inet->inet_daddr; \ + } while (0) + +#if IS_ENABLED(CONFIG_IPV6) + +#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ + do { \ + if (sk->sk_family == AF_INET6) { \ + struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ + \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = inet->inet_sport; \ + v6->sin6_addr = inet6_sk(sk)->saddr; \ + v6 = (void *)__entry->daddr; \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = inet->inet_dport; \ + v6->sin6_addr = sk->sk_v6_daddr; \ + } else \ + TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); \ + } while (0) + +#else + +#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ + TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); + +#endif + +#endif diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index b5ae3fbb74c8..878b2be7ce77 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -256,44 +256,7 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); - -#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk) \ - do { \ - struct sockaddr_in *v4 = (void *)__entry->saddr; \ - \ - v4->sin_family = AF_INET; \ - v4->sin_port = inet->inet_sport; \ - v4->sin_addr.s_addr = inet->inet_saddr; \ - v4 = (void *)__entry->daddr; \ - v4->sin_family = AF_INET; \ - v4->sin_port = inet->inet_dport; \ - v4->sin_addr.s_addr = inet->inet_daddr; \ - } while (0) - -#if IS_ENABLED(CONFIG_IPV6) - -#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ - do { \ - if (sk->sk_family == AF_INET6) { \ - struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ - \ - v6->sin6_family = AF_INET6; \ - v6->sin6_port = inet->inet_sport; \ - v6->sin6_addr = inet6_sk(sk)->saddr; \ - v6 = (void *)__entry->daddr; \ - v6->sin6_family = AF_INET6; \ - v6->sin6_port = inet->inet_dport; \ - v6->sin6_addr = sk->sk_v6_daddr; \ - } else \ - TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); \ - } while (0) - -#else - -#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \ - TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); - -#endif +#include TRACE_EVENT(tcp_probe, diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 2e7b56097bc4..4215f13a63af 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -27,3 +27,6 @@ dccp-$(CONFIG_SYSCTL) += sysctl.o dccp_diag-y := diag.o dccp_probe-y := probe.o + +# build with local directory for trace.h +CFLAGS_proto.o := -I$(src) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7a75a1d3568b..fa7e92e08920 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -38,6 +38,9 @@ #include "dccp.h" #include "feat.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); @@ -761,6 +764,8 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int rc, size; long timeo; + trace_dccp_probe(sk, len); + if (len > dp->dccps_mss_cache) return -EMSGSIZE; diff --git a/net/dccp/trace.h b/net/dccp/trace.h new file mode 100644 index 000000000000..5062421beee9 --- /dev/null +++ b/net/dccp/trace.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dccp + +#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DCCP_H + +#include +#include "dccp.h" +#include "ccids/ccid3.h" +#include +#include + +TRACE_EVENT(dccp_probe, + + TP_PROTO(struct sock *sk, size_t size), + + TP_ARGS(sk, size), + + TP_STRUCT__entry( + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, size) + __field(__u16, tx_s) + __field(__u32, tx_rtt) + __field(__u32, tx_p) + __field(__u32, tx_x_calc) + __field(__u64, tx_x_recv) + __field(__u64, tx_x) + __field(__u32, tx_t_ipi) + ), + + TP_fast_assign( + const struct inet_sock *inet = inet_sk(sk); + struct ccid3_hc_tx_sock *hc = NULL; + + if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) + hc = ccid3_hc_tx_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + /* For filtering use */ + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + __entry->size = size; + if (hc) { + __entry->tx_s = hc->tx_s; + __entry->tx_rtt = hc->tx_rtt; + __entry->tx_p = hc->tx_p; + __entry->tx_x_calc = hc->tx_x_calc; + __entry->tx_x_recv = hc->tx_x_recv >> 6; + __entry->tx_x = hc->tx_x >> 6; + __entry->tx_t_ipi = hc->tx_t_ipi; + } else { + __entry->tx_s = 0; + memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi - + (void *)&__entry->tx_rtt + + sizeof(__entry->tx_t_ipi)); + } + ), + + TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " + "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", + __entry->saddr, __entry->daddr, __entry->size, + __entry->tx_s, __entry->tx_rtt, __entry->tx_p, + __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, + __entry->tx_t_ipi) +); + +#endif /* _TRACE_TCP_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include -- cgit v1.2.3 From 0c3b34d804947fef7fb9e74912b7b7563729231e Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 7 Jan 2018 14:31:47 +0800 Subject: net: tracepoint: exposing sk_faimily in tracepoint inet_sock_set_state As of now, there're two sk_family are traced with sock:inet_sock_set_state, which are AF_INET and AF_INET6. So the sk_family are exposed as well. Then we can conveniently use it to do the filter. Both sk_family and sk_protocol are showed in the printk message, so we need not expose them as tracepoint arguments. Suggested-by: Brendan Gregg Suggested-by: Song Liu Signed-off-by: Yafang Shao Reviewed-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/sock.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3537c5f34d77..3176a3931107 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -11,7 +11,11 @@ #include #include -/* The protocol traced by sock_set_state */ +#define family_names \ + EM(AF_INET) \ + EMe(AF_INET6) + +/* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ @@ -37,6 +41,7 @@ #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); +family_names inet_protocol_names tcp_state_names @@ -45,6 +50,9 @@ tcp_state_names #define EM(a) { a, #a }, #define EMe(a) { a, #a } +#define show_family_name(val) \ + __print_symbolic(val, family_names) + #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) @@ -118,6 +126,7 @@ TRACE_EVENT(inet_sock_set_state, __field(int, newstate) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __field(__u8, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) @@ -134,6 +143,7 @@ TRACE_EVENT(inet_sock_set_state, __entry->oldstate = oldstate; __entry->newstate = newstate; + __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); @@ -160,7 +170,8 @@ TRACE_EVENT(inet_sock_set_state, } ), - TP_printk("protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", + TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", + show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, -- cgit v1.2.3