summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c235
1 files changed, 190 insertions, 45 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 06fe1cf645d5..50aaa1527150 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -253,6 +253,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
if (unlikely(len > icsk->icsk_ack.rcv_mss +
MAX_TCP_OPTION_SPACE))
tcp_gro_dev_warn(sk, skb, len);
+ /* If the skb has a len of exactly 1*MSS and has the PSH bit
+ * set then it is likely the end of an application write. So
+ * more data may not be arriving soon, and yet the data sender
+ * may be waiting for an ACK if cwnd-bound or using TX zero
+ * copy. So we set ICSK_ACK_PUSHED here so that
+ * tcp_cleanup_rbuf() will send an ACK immediately if the app
+ * reads all of the data and is not ping-pong. If len > MSS
+ * then this logic does not matter (and does not hurt) because
+ * tcp_cleanup_rbuf() will always ACK immediately if the app
+ * reads data and there is more than an MSS of unACKed data.
+ */
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -680,6 +693,23 @@ new_measure:
tp->rcv_rtt_est.time = tp->tcp_mstamp;
}
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+{
+ u32 delta, delta_us;
+
+ delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
+ if (tp->tcp_usec_ts)
+ return delta;
+
+ if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+ if (!delta)
+ delta = 1;
+ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ return delta_us;
+ }
+ return -1;
+}
+
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
const struct sk_buff *skb)
{
@@ -691,15 +721,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
if (TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
- u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
- u32 delta_us;
-
- if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
- if (!delta)
- delta = 1;
- delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
- tcp_rcv_rtt_update(tp, delta_us, 0);
- }
+ s32 delta = tcp_rtt_tsopt_us(tp);
+
+ if (delta >= 0)
+ tcp_rcv_rtt_update(tp, delta, 0);
}
}
@@ -765,6 +790,16 @@ new_measure:
tp->rcvq_space.time = tp->tcp_mstamp;
}
+static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));
+#endif
+}
+
/* There is something which you must keep in mind when you analyze the
* behavior of the tp->ato delayed ack timeout interval. When a
* connection starts up, we want to ack as quickly as possible. The
@@ -813,6 +848,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
}
}
icsk->icsk_ack.lrcvtime = now;
+ tcp_save_lrcv_flowlabel(sk, skb);
tcp_ecn_check_ce(sk, skb);
@@ -927,8 +963,8 @@ static void tcp_update_pacing_rate(struct sock *sk)
* without any lock. We want to make sure compiler wont store
* intermediate values in this location.
*/
- WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
- sk->sk_max_pacing_rate));
+ WRITE_ONCE(sk->sk_pacing_rate,
+ min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -2088,6 +2124,10 @@ void tcp_clear_retrans(struct tcp_sock *tp)
tp->undo_marker = 0;
tp->undo_retrans = -1;
tp->sacked_out = 0;
+ tp->rto_stamp = 0;
+ tp->total_rto = 0;
+ tp->total_rto_recoveries = 0;
+ tp->total_rto_time = 0;
}
static inline void tcp_init_undo(struct tcp_sock *tp)
@@ -2194,16 +2234,17 @@ void tcp_enter_loss(struct sock *sk)
* restore sanity to the SACK scoreboard. If the apparent reneging
* persists until this RTO then we'll clear the SACK scoreboard.
*/
-static bool tcp_check_sack_reneging(struct sock *sk, int flag)
+static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
{
- if (flag & FLAG_SACK_RENEGING &&
- flag & FLAG_SND_UNA_ADVANCED) {
+ if (*ack_flag & FLAG_SACK_RENEGING &&
+ *ack_flag & FLAG_SND_UNA_ADVANCED) {
struct tcp_sock *tp = tcp_sk(sk);
unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
msecs_to_jiffies(10));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
delay, TCP_RTO_MAX);
+ *ack_flag &= ~FLAG_SET_XMIT_TIMER;
return true;
}
return false;
@@ -2414,7 +2455,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
- tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
}
/* Nothing was retransmitted or returned timestamp is less
@@ -2825,6 +2866,14 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
+static void tcp_update_rto_time(struct tcp_sock *tp)
+{
+ if (tp->rto_stamp) {
+ tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
+ tp->rto_stamp = 0;
+ }
+}
+
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
@@ -2973,7 +3022,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
- if (tcp_check_sack_reneging(sk, flag))
+ if (tcp_check_sack_reneging(sk, ack_flag))
return;
/* C. Check consistency of the current state. */
@@ -3029,6 +3078,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, num_dupack, rexmit);
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
+ tcp_update_rto_time(tp);
tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
@@ -3108,17 +3159,10 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
- if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- flag & FLAG_ACKED) {
- u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
-
- if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
- if (!delta)
- delta = 1;
- seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
- ca_rtt_us = seq_rtt_us;
- }
- }
+ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
+ tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
+ seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
return false;
@@ -3528,6 +3572,21 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
(ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
}
+static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held((struct sock *)tp));
+ if (ao && ack < tp->snd_una)
+ ao->snd_sne++;
+#endif
+}
+
/* If we update tp->snd_una, also update tp->bytes_acked */
static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
{
@@ -3535,9 +3594,25 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
sock_owned_by_me((struct sock *)tp);
tp->bytes_acked += delta;
+ tcp_snd_sne_update(tp, ack);
tp->snd_una = ack;
}
+static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held((struct sock *)tp));
+ if (ao && seq < tp->rcv_nxt)
+ ao->rcv_sne++;
+#endif
+}
+
/* If we update tp->rcv_nxt, also update tp->bytes_received */
static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
@@ -3545,6 +3620,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
sock_owned_by_me((struct sock *)tp);
tp->bytes_received += delta;
+ tcp_rcv_sne_update(tp, seq);
WRITE_ONCE(tp->rcv_nxt, seq);
}
@@ -4211,39 +4287,58 @@ static bool tcp_fast_parse_options(const struct net *net,
return true;
}
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/*
- * Parse MD5 Signature option
+ * Parse Signature options
*/
-const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
+int tcp_do_parse_auth_options(const struct tcphdr *th,
+ const u8 **md5_hash, const u8 **ao_hash)
{
int length = (th->doff << 2) - sizeof(*th);
const u8 *ptr = (const u8 *)(th + 1);
+ unsigned int minlen = TCPOLEN_MD5SIG;
+
+ if (IS_ENABLED(CONFIG_TCP_AO))
+ minlen = sizeof(struct tcp_ao_hdr) + 1;
+
+ *md5_hash = NULL;
+ *ao_hash = NULL;
/* If not enough data remaining, we can short cut */
- while (length >= TCPOLEN_MD5SIG) {
+ while (length >= minlen) {
int opcode = *ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
- return NULL;
+ return 0;
case TCPOPT_NOP:
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2 || opsize > length)
- return NULL;
- if (opcode == TCPOPT_MD5SIG)
- return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
+ return -EINVAL;
+ if (opcode == TCPOPT_MD5SIG) {
+ if (opsize != TCPOLEN_MD5SIG)
+ return -EINVAL;
+ if (unlikely(*md5_hash || *ao_hash))
+ return -EEXIST;
+ *md5_hash = ptr;
+ } else if (opcode == TCPOPT_AO) {
+ if (opsize <= sizeof(struct tcp_ao_hdr))
+ return -EINVAL;
+ if (unlikely(*md5_hash || *ao_hash))
+ return -EEXIST;
+ *ao_hash = ptr;
+ }
}
ptr += opsize - 2;
length -= opsize;
}
- return NULL;
+ return 0;
}
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
+EXPORT_SYMBOL(tcp_do_parse_auth_options);
#endif
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
@@ -4486,12 +4581,23 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
{
/* When the ACK path fails or drops most ACKs, the sender would
* timeout and spuriously retransmit the same segment repeatedly.
- * The receiver remembers and reflects via DSACKs. Leverage the
- * DSACK state and change the txhash to re-route speculatively.
+ * If it seems our ACKs are not reaching the other side,
+ * based on receiving a duplicate data segment with new flowlabel
+ * (suggesting the sender suffered an RTO), and we are not already
+ * repathing due to our own RTO, then rehash the socket to repath our
+ * packets.
*/
- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel !=
+ ntohl(ip6_flowlabel(ipv6_hdr(skb)))) &&
sk_rethink_txhash(sk))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+
+ /* Save last flowlabel after a spurious retrans. */
+ tcp_save_lrcv_flowlabel(sk, skb);
+#endif
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4808,6 +4914,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
u32 seq, end_seq;
bool fragstolen;
+ tcp_save_lrcv_flowlabel(sk, skb);
tcp_ecn_check_ce(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
@@ -5553,6 +5660,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+ /* If we are running from __release_sock() in user context,
+ * Defer the ack until tcp_release_cb().
+ */
+ if (sock_owned_by_user_nocheck(sk) &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+ set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+ return;
+ }
send_now:
tcp_send_ack(sk);
return;
@@ -6087,6 +6202,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_ao_finish_connect(sk, skb);
tcp_set_state(sk, TCP_ESTABLISHED);
icsk->icsk_ack.lrcvtime = tcp_jiffies32;
@@ -6235,7 +6351,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
- tcp_time_stamp(tp))) {
+ tcp_time_stamp_ts(tp))) {
NET_INC_STATS(sock_net(sk),
LINUX_MIB_PAWSACTIVEREJECTED);
goto reset_and_undo;
@@ -6372,6 +6488,16 @@ consume:
* simultaneous connect with crossed SYNs.
* Particularly, it can be connect to self.
*/
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held(sk));
+ if (ao) {
+ WRITE_ONCE(ao->risn, th->seq);
+ ao->rcv_sne = 0;
+ }
+#endif
tcp_set_state(sk, TCP_SYN_RECV);
if (tp->rx_opt.saw_tstamp) {
@@ -6436,22 +6562,24 @@ reset_and_undo:
static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct request_sock *req;
/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
* undo. If peer SACKs triggered fast recovery, we can't undo here.
*/
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
- tcp_try_undo_loss(sk, false);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
+ tcp_try_undo_recovery(sk);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
- tcp_sk(sk)->retrans_stamp = 0;
+ tcp_update_rto_time(tp);
+ tp->retrans_stamp = 0;
inet_csk(sk)->icsk_retransmits = 0;
/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
* we no longer need req so release it.
*/
- req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ req = rcu_dereference_protected(tp->fastopen_rsk,
lockdep_sock_is_held(sk));
reqsk_fastopen_remove(sk, req, false);
@@ -6582,6 +6710,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
skb);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
+ tcp_ao_established(sk);
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
@@ -6958,6 +7087,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct flowi fl;
u8 syncookies;
+#ifdef CONFIG_TCP_AO
+ const struct tcp_ao_hdr *aoh;
+#endif
+
syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
/* TW buckets are converted to open requests without
@@ -6982,6 +7115,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
req->syncookie = want_cookie;
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
+ tcp_rsk(req)->req_usec_ts = -1;
#if IS_ENABLED(CONFIG_MPTCP)
tcp_rsk(req)->is_mptcp = 0;
#endif
@@ -7043,6 +7177,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
inet_rsk(req)->ecn_ok = 0;
}
+#ifdef CONFIG_TCP_AO
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ goto drop_and_release; /* Invalid TCP options */
+ if (aoh) {
+ tcp_rsk(req)->maclen = aoh->length - sizeof(struct tcp_ao_hdr);
+ tcp_rsk(req)->ao_rcv_next = aoh->keyid;
+ tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
+ } else {
+ tcp_rsk(req)->maclen = 0;
+ }
+#endif
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;