From c10d73671ad30f54692f7f69f0e09e75d3a8926a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jan 2013 15:26:34 -0800 Subject: softirq: reduce latencies In various network workloads, __do_softirq() latencies can be up to 20 ms if HZ=1000, and 200 ms if HZ=100. This is because we iterate 10 times in the softirq dispatcher, and some actions can consume a lot of cycles. This patch changes the fallback to ksoftirqd condition to : - A time limit of 2 ms. - need_resched() being set on current task When one of this condition is met, we wakeup ksoftirqd for further softirq processing if we still have pending softirqs. Using need_resched() as the only condition can trigger RCU stalls, as we can keep BH disabled for too long. I ran several benchmarks and got no significant difference in throughput, but a very significant reduction of latencies (one order of magnitude) : In following bench, 200 antagonist "netperf -t TCP_RR" are started in background, using all available cpus. Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC IRQ (hard+soft) Before patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=550110.424 MIN_LATENCY=146858 MAX_LATENCY=997109 P50_LATENCY=305000 P90_LATENCY=550000 P99_LATENCY=710000 MEAN_LATENCY=376989.12 STDDEV_LATENCY=184046.92 After patch : # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind RT_LATENCY=40545.492 MIN_LATENCY=9834 MAX_LATENCY=78366 P50_LATENCY=33583 P90_LATENCY=59000 P99_LATENCY=69000 MEAN_LATENCY=38364.67 STDDEV_LATENCY=12865.26 Signed-off-by: Eric Dumazet Cc: David Miller Cc: Tom Herbert Cc: Ben Hutchings Signed-off-by: David S. Miller --- kernel/softirq.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567babe789..47cb991c6ba4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing MAX_SOFTIRQ_RESTART times, - * and we fall back to softirqd after that. + * We restart softirq processing for at most 2 ms, + * and if need_resched() is not set. * - * This number has been established via experimentation. + * These limits have been established via experimentation. * The two things to balance is latency against fairness - * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. */ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) asmlinkage void __do_softirq(void) { struct softirq_action *h; __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; + unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; @@ -264,11 +264,12 @@ restart: local_irq_disable(); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending) { + if (time_before(jiffies, end) && !need_resched()) + goto restart; - if (pending) wakeup_softirqd(); + } lockdep_softirq_exit(); -- cgit v1.2.3 From ca2eb5679f8ddffff60156af42595df44a315ef0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 5 Feb 2013 07:25:17 +0000 Subject: tcp: remove Appropriate Byte Count support TCP Appropriate Byte Count was added by me, but later disabled. There is no point in maintaining it since it is a potential source of bugs and Linux already implements other better window protection heuristics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 11 ----------- include/linux/tcp.h | 1 - include/net/tcp.h | 1 - kernel/sysctl_binary.c | 1 - net/ipv4/sysctl_net_ipv4.c | 7 ------- net/ipv4/tcp.c | 1 - net/ipv4/tcp_cong.c | 30 +----------------------------- net/ipv4/tcp_input.c | 15 --------------- net/ipv4/tcp_minisocks.c | 1 - 9 files changed, 1 insertion(+), 67 deletions(-) (limited to 'kernel') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 19ac1802bfd4..dc2dc87d2557 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -130,17 +130,6 @@ somaxconn - INTEGER Defaults to 128. See also tcp_max_syn_backlog for additional tuning for TCP sockets. -tcp_abc - INTEGER - Controls Appropriate Byte Count (ABC) defined in RFC3465. - ABC is a way of increasing congestion window (cwnd) more slowly - in response to partial acknowledgments. - Possible values are: - 0 increase cwnd once per acknowledgment (no ABC) - 1 increase cwnd once per acknowledgment of full sized segment - 2 allow increase cwnd by two if acknowledgment is - of two segments to compensate for delayed acknowledgments. - Default: 0 (off) - tcp_abort_on_overflow - BOOLEAN If listening service is too slow to accept new connections, reset them. Default state is FALSE. It means that if overflow diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e1d2283e3cc..6d0d46138ae8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -246,7 +246,6 @@ struct tcp_sock { u32 sacked_out; /* SACK'd packets */ u32 fackets_out; /* FACK'd packets */ u32 tso_deferred; - u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */ /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; diff --git a/include/net/tcp.h b/include/net/tcp.h index 614af8b7758e..23f2e98d4b65 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -279,7 +279,6 @@ extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; -extern int sysctl_tcp_abc; extern int sysctl_tcp_mtu_probing; extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 5a6384450501..b669ca1fa103 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_ABC, "tcp_abc" }, { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2622707602d1..960fd29d9b8e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -632,13 +632,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_abc", - .data = &sysctl_tcp_abc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_mtu_probing", .data = &sysctl_tcp_mtu_probing, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3ec1f69c5ceb..2c7e5963c2ea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2289,7 +2289,6 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; - tp->bytes_acked = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index cdf2e707bb10..019c2389a341 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -317,28 +317,11 @@ void tcp_slow_start(struct tcp_sock *tp) snd_cwnd = 1U; } - /* RFC3465: ABC Slow start - * Increase only after a full MSS of bytes is acked - * - * TCP sender SHOULD increase cwnd by the number of - * previously unacknowledged bytes ACKed by each incoming - * acknowledgment, provided the increase is not more than L - */ - if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache) - return; - if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ else cnt = snd_cwnd; /* exponential increase */ - /* RFC3465: ABC - * We MAY increase by 2 if discovered delayed ack - */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) - cnt <<= 1; - tp->bytes_acked = 0; - tp->snd_cwnd_cnt += cnt; while (tp->snd_cwnd_cnt >= snd_cwnd) { tp->snd_cwnd_cnt -= snd_cwnd; @@ -378,20 +361,9 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - /* In dangerous area, increase slowly. */ - else if (sysctl_tcp_abc) { - /* RFC3465: Appropriate Byte Count - * increase once for each full cwnd acked - */ - if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { - tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } - } else { + else tcp_cong_avoid_ai(tp, tp->snd_cwnd); - } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e376aa9591bc..f56bd1082f54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -98,7 +98,6 @@ int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_abc __read_mostly; int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ @@ -2007,7 +2006,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->frto_counter = 0; - tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); @@ -2056,7 +2054,6 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->bytes_acked = 0; tcp_clear_retrans_partial(tp); if (tcp_is_reno(tp)) @@ -2684,7 +2681,6 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) struct tcp_sock *tp = tcp_sk(sk); tp->high_seq = tp->snd_nxt; - tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; @@ -2735,7 +2731,6 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; - tp->bytes_acked = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; tcp_init_cwnd_reduction(sk, set_ssthresh); @@ -3417,7 +3412,6 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; - tp->bytes_acked = 0; TCP_ECN_queue_cwr(tp); tcp_moderate_cwnd(tp); } @@ -3609,15 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; - if (sysctl_tcp_abc) { - if (icsk->icsk_ca_state < TCP_CA_CWR) - tp->bytes_acked += ack - prior_snd_una; - else if (icsk->icsk_ca_state == TCP_CA_Loss) - /* we assume just one segment left network */ - tp->bytes_acked += min(ack - prior_snd_una, - tp->mss_cache); - } - prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f35f2dfb6401..f0409287b5f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -446,7 +446,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - newtp->bytes_acked = 0; newtp->frto_counter = 0; newtp->frto_highmark = 0; -- cgit v1.2.3