Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	404
1 file changed, 198 insertions(+), 206 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a906e0200ff2..a12b455928e5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -68,12 +68,12 @@
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
+#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
-#include <net/netdma.h>
#include <linux/errqueue.h>
int sysctl_tcp_timestamps __read_mostly = 1;
@@ -201,28 +201,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk)
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
-static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
if (tp->ecn_flags & TCP_ECN_OK)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
-static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
- if (!(tp->ecn_flags & TCP_ECN_OK))
- return;
-
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
@@ -233,30 +230,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
tcp_enter_quickack_mode((struct sock *)tp);
break;
case INET_ECN_CE:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode((struct sock *)tp);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
- /* fallinto */
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
default:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
}
}
-static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ if (tp->ecn_flags & TCP_ECN_OK)
+ __tcp_ecn_check_ce(tp, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
return true;
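
Note on the __tcp_ecn_check_ce() switch above: it keys off the two ECN bits of the IP tos/traffic-class byte (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK). A minimal standalone sketch of the RFC 3168 codepoints involved, using illustrative constants rather than the kernel's INET_ECN_* definitions:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* RFC 3168 ECN codepoints live in the low two bits of the IPv4 TOS /
 * IPv6 traffic-class byte. CE (0b11) is the mark that makes the receiver
 * demand CWR from the sender; the two ECT codepoints mean "ECN-capable".
 */
#define ECN_MASK    0x3
#define ECN_NOT_ECT 0x0
#define ECN_ECT_1   0x1
#define ECN_ECT_0   0x2
#define ECN_CE      0x3

static bool dsfield_is_ce(uint8_t dsfield)
{
	return (dsfield & ECN_MASK) == ECN_CE;
}

int main(void)
{
	assert(dsfield_is_ce(0x03));	/* CE-marked packet */
	assert(!dsfield_is_ce(0x02));	/* ECT(0): ECN-capable, not congestion-marked */
	return 0;
}
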
@@ -653,7 +663,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
}
icsk->icsk_ack.lrcvtime = now;
- TCP_ECN_check_ce(tp, skb);
+ tcp_ecn_check_ce(tp, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb);
@@ -1295,9 +1305,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
TCP_SKB_CB(prev)->end_seq += shifted;
TCP_SKB_CB(skb)->seq += shifted;
- skb_shinfo(prev)->gso_segs += pcount;
- BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
- skb_shinfo(skb)->gso_segs -= pcount;
+ tcp_skb_pcount_add(prev, pcount);
+ BUG_ON(tcp_skb_pcount(skb) < pcount);
+ tcp_skb_pcount_add(skb, -pcount);
/* When we're adding to gso_segs == 1, gso_size will be zero,
* in theory this shouldn't be necessary but as long as DSACK
@@ -1310,7 +1320,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
- if (skb_shinfo(skb)->gso_segs <= 1) {
+ if (tcp_skb_pcount(skb) <= 1) {
skb_shinfo(skb)->gso_size = 0;
skb_shinfo(skb)->gso_type = 0;
}
@@ -1888,21 +1898,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
tp->sacked_out = 0;
}
-static void tcp_clear_retrans_partial(struct tcp_sock *tp)
+void tcp_clear_retrans(struct tcp_sock *tp)
{
tp->retrans_out = 0;
tp->lost_out = 0;
-
tp->undo_marker = 0;
tp->undo_retrans = -1;
+ tp->fackets_out = 0;
+ tp->sacked_out = 0;
}
-void tcp_clear_retrans(struct tcp_sock *tp)
+static inline void tcp_init_undo(struct tcp_sock *tp)
{
- tcp_clear_retrans_partial(tp);
-
- tp->fackets_out = 0;
- tp->sacked_out = 0;
+ tp->undo_marker = tp->snd_una;
+ /* Retransmission still in flight may cause DSACKs later. */
+ tp->undo_retrans = tp->retrans_out ? : -1;
}
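
The "tp->retrans_out ? : -1" expression in tcp_init_undo() above uses the GNU C conditional with an omitted middle operand; a tiny standalone illustration of its semantics:

#include <assert.h>

/* GNU C extension: "a ? : b" evaluates a once and yields a if it is
 * non-zero, otherwise b. So undo_retrans becomes retrans_out while
 * retransmissions are in flight, and -1 ("unset") otherwise.
 */
static int pick_undo_retrans(int retrans_out)
{
	return retrans_out ? : -1;
}

int main(void)
{
	assert(pick_undo_retrans(3) == 3);
	assert(pick_undo_retrans(0) == -1);
	return 0;
}
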
/* Enter Loss state. If we detect SACK reneging, forget all SACK information
@@ -1925,18 +1935,18 @@ void tcp_enter_loss(struct sock *sk)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
+ tcp_init_undo(tp);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tcp_clear_retrans_partial(tp);
+ tp->retrans_out = 0;
+ tp->lost_out = 0;
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- tp->undo_marker = tp->snd_una;
-
skb = tcp_write_queue_head(sk);
is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
@@ -1950,9 +1960,6 @@ void tcp_enter_loss(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->undo_marker = 0;
-
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
@@ -1972,7 +1979,7 @@ void tcp_enter_loss(struct sock *sk)
sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
- TCP_ECN_queue_cwr(tp);
+ tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
* loss recovery is underway except recurring timeout(s) on
@@ -2364,7 +2371,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
- TCP_ECN_withdraw_cwr(tp);
+ tcp_ecn_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2494,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tp->prr_delivered = 0;
tp->prr_out = 0;
tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
- TCP_ECN_queue_cwr(tp);
+ tcp_ecn_queue_cwr(tp);
}
static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
@@ -2671,8 +2678,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
NET_INC_STATS_BH(sock_net(sk), mib_idx);
tp->prior_ssthresh = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out ? : -1;
+ tcp_init_undo(tp);
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
if (!ece_ack)
@@ -2971,7 +2977,8 @@ void tcp_rearm_rto(struct sock *sk)
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
- const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+ const u32 rto_time_stamp =
+ tcp_skb_timestamp(skb) + rto;
s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
/* delta may not be positive if the socket is locked
* when the retrans timer fires and is rescheduled.
@@ -3023,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
return packets_acked;
}
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+ u32 prior_snd_una)
+{
+ const struct skb_shared_info *shinfo;
+
+ /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+ if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+ return;
+
+ shinfo = skb_shinfo(skb);
+ if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+ between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+}
+
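
For context, tcp_ack_tstamp() fires only when the application asked for ACK timestamps. A hedged userspace sketch of opting in (flag names from <linux/net_tstamp.h>; the exact flag set and the SO_TIMESTAMPING value depend on kernel and libc headers, so treat this as an assumption-laden example, not part of the patch):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

#ifndef SO_TIMESTAMPING
#define SO_TIMESTAMPING 37	/* common value; arch-dependent */
#endif

/* Request software TX-ACK timestamps; completions are later read from
 * the socket error queue (recvmsg() with MSG_ERRQUEUE) as
 * SCM_TIMESTAMPING control messages keyed by tskey.
 */
static int enable_ack_timestamps(int fd)
{
	unsigned int flags = SOF_TIMESTAMPING_TX_ACK |	/* stamp when peer ACKs the data */
			     SOF_TIMESTAMPING_SOFTWARE |/* software timestamps suffice */
			     SOF_TIMESTAMPING_OPT_ID;	/* reports carry a byte-offset id */

	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags)) < 0) {
		perror("setsockopt(SO_TIMESTAMPING)");
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	return fd >= 0 ? enable_ack_timestamps(fd) : 1;
}
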
/* Remove acknowledged frames from the retransmission queue. If our packet
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
@@ -3046,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
first_ackt.v64 = 0;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
- struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u8 sacked = scb->sacked;
u32 acked_pcount;
- if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
- between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
- __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
@@ -3067,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
fully_acked = false;
} else {
+ /* Speedup tcp_unlink_write_queue() and next loop */
+ prefetchw(skb->next);
acked_pcount = tcp_skb_pcount(skb);
}
- if (sacked & TCPCB_RETRANS) {
+ if (unlikely(sacked & TCPCB_RETRANS)) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
@@ -3101,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if (!(scb->tcp_flags & TCPHDR_SYN)) {
+ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
@@ -3113,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- if (skb == tp->retransmit_skb_hint)
+ if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
- if (skb == tp->lost_skb_hint)
+ if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
}
@@ -3126,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now);
- if (first_ackt.v64) {
+ if (likely(first_ackt.v64)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
}
@@ -3211,9 +3232,10 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using!
*/
} else {
+ unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
- TCP_RTO_MAX);
+ when, TCP_RTO_MAX);
}
}
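
inet_csk_rto_backoff() replaces the open-coded min(icsk_rto << icsk_backoff, TCP_RTO_MAX) above. A rough standalone sketch of the clamp it performs (not the kernel source itself):

#include <stdint.h>
#include <stdio.h>

/* Exponential backoff with a ceiling: the retransmission timeout is
 * doubled per backoff step (left shift) and capped at max_when, which
 * the caller above passes as TCP_RTO_MAX. The 64-bit intermediate keeps
 * large backoff counts from overflowing the shift.
 */
static unsigned long rto_backoff(unsigned long rto, unsigned int backoff,
				 unsigned long max_when)
{
	uint64_t when = (uint64_t)rto << backoff;

	return when > max_when ? max_when : (unsigned long)when;
}

int main(void)
{
	/* e.g. base RTO of 200 (jiffies-like units), cap of 120000 */
	printf("%lu %lu %lu\n",
	       rto_backoff(200, 0, 120000),
	       rto_backoff(200, 5, 120000),
	       rto_backoff(200, 12, 120000));	/* clamped to 120000 */
	return 0;
}
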
@@ -3364,6 +3386,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
}
}
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->in_ack_event)
+ icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
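
tcp_in_ack_event() gives congestion-control modules a per-ACK hook carrying the CA_ACK_* flags set in tcp_ack() below. A minimal sketch of a module consuming it (assumes the tcp_congestion_ops layout of this kernel series; illustrative only, not part of the patch):

#include <linux/module.h>
#include <net/tcp.h>

/* Per-connection private state, stored in the icsk CA area. */
struct ack_ev_ca {
	u32 ece_acks;	/* ACKs that carried an ECN echo */
};

static void ack_ev_in_ack_event(struct sock *sk, u32 flags)
{
	struct ack_ev_ca *ca = inet_csk_ca(sk);

	if (flags & CA_ACK_ECE)
		ca->ece_acks++;
}

static struct tcp_congestion_ops ack_ev_ops __read_mostly = {
	.in_ack_event	= ack_ev_in_ack_event,
	.ssthresh	= tcp_reno_ssthresh,	/* reuse Reno behaviour otherwise */
	.cong_avoid	= tcp_reno_cong_avoid,
	.owner		= THIS_MODULE,
	.name		= "ack_ev_demo",
};

static int __init ack_ev_init(void)
{
	BUILD_BUG_ON(sizeof(struct ack_ev_ca) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&ack_ev_ops);
}

static void __exit ack_ev_exit(void)
{
	tcp_unregister_congestion_control(&ack_ev_ops);
}

module_init(ack_ev_init);
module_exit(ack_ev_exit);
MODULE_LICENSE("GPL");
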
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3379,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int acked = 0; /* Number of packets newly acked */
long sack_rtt_us = -1L;
+ /* We very likely will need to access write queue head. */
+ prefetchw(sk->sk_write_queue.next);
+
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
@@ -3423,10 +3456,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
- tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+ tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
+ u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
@@ -3438,10 +3473,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt_us);
- if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
- tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+ tcp_in_ack_event(sk, ack_ev_flags);
}
/* We passed data and got it acked, remove any soft error
@@ -4063,6 +4103,44 @@ static void tcp_sack_remove(struct tcp_sock *tp)
tp->rx_opt.num_sacks = num_sacks;
}
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+	/* It's possible this segment overlaps with prior segment in queue */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+ return true;
+}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -4070,7 +4148,8 @@ static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
- struct sk_buff *skb;
+ struct sk_buff *skb, *tail;
+ bool fragstolen, eaten;
while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
@@ -4083,9 +4162,9 @@ static void tcp_ofo_queue(struct sock *sk)
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
+ __skb_unlink(skb, &tp->out_of_order_queue);
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
- __skb_unlink(skb, &tp->out_of_order_queue);
__kfree_skb(skb);
continue;
}
@@ -4093,11 +4172,15 @@ static void tcp_ofo_queue(struct sock *sk)
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
- __skb_unlink(skb, &tp->out_of_order_queue);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- if (tcp_hdr(skb)->fin)
+ if (!eaten)
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
+ if (eaten)
+ kfree_skb_partial(skb, fragstolen);
}
}
@@ -4124,53 +4207,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
return 0;
}
-/**
- * tcp_try_coalesce - try to merge skb to prior one
- * @sk: socket
- * @to: prior buffer
- * @from: buffer to add in queue
- * @fragstolen: pointer to boolean
- *
- * Before queueing skb @from after @to, try to merge them
- * to reduce overall memory use and queue lengths, if cost is small.
- * Packets in ofo or receive queues can stay a long time.
- * Better try to coalesce them right now to avoid future collapses.
- * Returns true if caller should free @from instead of queueing it
- */
-static bool tcp_try_coalesce(struct sock *sk,
- struct sk_buff *to,
- struct sk_buff *from,
- bool *fragstolen)
-{
- int delta;
-
- *fragstolen = false;
-
- if (tcp_hdr(from)->fin)
- return false;
-
- /* Its possible this segment overlaps with prior segment in queue */
- if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
- return false;
-
- if (!skb_try_coalesce(to, from, fragstolen, &delta))
- return false;
-
- atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
- TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
- TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
- return true;
-}
-
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb1;
u32 seq, end_seq;
- TCP_ECN_check_ce(tp, skb);
+ tcp_ecn_check_ce(tp, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4309,24 +4352,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
- struct sk_buff *skb = NULL;
- struct tcphdr *th;
+ struct sk_buff *skb;
bool fragstolen;
if (size == 0)
return 0;
- skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+ skb = alloc_skb(size, sk->sk_allocation);
if (!skb)
goto err;
- if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto err_free;
- th = (struct tcphdr *)skb_put(skb, sizeof(*th));
- skb_reset_transport_header(skb);
- memset(th, 0, sizeof(*th));
-
if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
goto err_free;
@@ -4334,7 +4372,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
- if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+ if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
@@ -4348,7 +4386,6 @@ err:
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
- const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
bool fragstolen = false;
@@ -4357,9 +4394,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
goto drop;
skb_dst_drop(skb);
- __skb_pull(skb, th->doff * 4);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
- TCP_ECN_accept_cwr(tp, skb);
+ tcp_ecn_accept_cwr(tp, skb);
tp->rx_opt.dsack = 0;
@@ -4401,7 +4438,7 @@ queue_and_out:
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (skb->len)
tcp_event_data_recv(sk, skb);
- if (th->fin)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
if (!skb_queue_empty(&tp->out_of_order_queue)) {
@@ -4516,7 +4553,7 @@ restart:
* - bloated or contains data before "start" or
* overlaps to the next one.
*/
- if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
+ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(skb->truesize) > skb->len ||
before(TCP_SKB_CB(skb)->seq, start))) {
end_of_skbs = false;
@@ -4535,30 +4572,18 @@ restart:
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
}
- if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
+ if (end_of_skbs ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
while (before(start, end)) {
+ int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
struct sk_buff *nskb;
- unsigned int header = skb_headroom(skb);
- int copy = SKB_MAX_ORDER(header, 0);
- /* Too big header? This can happen with IPv6. */
- if (copy < 0)
- return;
- if (end - start < copy)
- copy = end - start;
- nskb = alloc_skb(copy + header, GFP_ATOMIC);
+ nskb = alloc_skb(copy, GFP_ATOMIC);
if (!nskb)
return;
- skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
- skb_set_network_header(nskb, (skb_network_header(skb) -
- skb->head));
- skb_set_transport_header(nskb, (skb_transport_header(skb) -
- skb->head));
- skb_reserve(nskb, header);
- memcpy(nskb->head, skb->head, header);
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
__skb_queue_before(list, skb, nskb);
@@ -4582,8 +4607,7 @@ restart:
skb = tcp_collapse_one(sk, skb, list);
if (!skb ||
skb == tail ||
- tcp_hdr(skb)->syn ||
- tcp_hdr(skb)->fin)
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
}
}
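
In the tcp_collapse() rewrite above, the per-skb copy budget becomes SKB_MAX_ORDER(0, 0), i.e. roughly one page minus the skb_shared_info overhead, instead of depending on the old skb's headroom. A rough recomputation under stated assumptions (page size and shared-info size vary by arch and config):

#include <stdio.h>

/* Illustrative arithmetic only, not the kernel's SKB_* macros. */
#define ASSUMED_PAGE_SIZE	4096u
#define ASSUMED_SHINFO_SIZE	320u	/* aligned sizeof(struct skb_shared_info), config dependent */

int main(void)
{
	unsigned int copy_cap = ASSUMED_PAGE_SIZE - ASSUMED_SHINFO_SIZE;

	printf("per-skb collapse copy cap ~ %u bytes\n", copy_cap);
	return 0;
}
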
@@ -4951,53 +4975,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
__tcp_checksum_complete_user(sk, skb);
}
-#ifdef CONFIG_NET_DMA
-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
- int hlen)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int chunk = skb->len - hlen;
- int dma_cookie;
- bool copied_early = false;
-
- if (tp->ucopy.wakeup)
- return false;
-
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = net_dma_find_channel();
-
- if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
-
- dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
- skb, hlen,
- tp->ucopy.iov, chunk,
- tp->ucopy.pinned_list);
-
- if (dma_cookie < 0)
- goto out;
-
- tp->ucopy.dma_cookie = dma_cookie;
- copied_early = true;
-
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- tcp_rcv_space_adjust(sk);
-
- if ((tp->ucopy.len == 0) ||
- (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
- (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
- tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk);
- }
- } else if (chunk > 0) {
- tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk);
- }
-out:
- return copied_early;
-}
-#endif /* CONFIG_NET_DMA */
-
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5177,27 +5154,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
} else {
int eaten = 0;
- int copied_early = 0;
bool fragstolen = false;
- if (tp->copied_seq == tp->rcv_nxt &&
- len - tcp_header_len <= tp->ucopy.len) {
-#ifdef CONFIG_NET_DMA
- if (tp->ucopy.task == current &&
- sock_owned_by_user(sk) &&
- tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
- copied_early = 1;
- eaten = 1;
- }
-#endif
- if (tp->ucopy.task == current &&
- sock_owned_by_user(sk) && !copied_early) {
- __set_current_state(TASK_RUNNING);
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt &&
+ len - tcp_header_len <= tp->ucopy.len &&
+ sock_owned_by_user(sk)) {
+ __set_current_state(TASK_RUNNING);
- if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
- eaten = 1;
- }
- if (eaten) {
+ if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
@@ -5213,9 +5178,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
__skb_pull(skb, tcp_header_len);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ eaten = 1;
}
- if (copied_early)
- tcp_cleanup_rbuf(sk, skb->len);
}
if (!eaten) {
if (tcp_checksum_complete_user(sk, skb))
@@ -5252,14 +5216,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
goto no_ack;
}
- if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
- __tcp_ack_snd_check(sk, 0);
+ __tcp_ack_snd_check(sk, 0);
no_ack:
-#ifdef CONFIG_NET_DMA
- if (copied_early)
- __skb_queue_tail(&sk->sk_async_wait_queue, skb);
- else
-#endif
if (eaten)
kfree_skb_partial(skb, fragstolen);
sk->sk_data_ready(sk);
@@ -5453,7 +5411,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* state to ESTABLISHED..."
*/
- TCP_ECN_rcv_synack(tp, th);
+ tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -5572,7 +5530,7 @@ discard:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- TCP_ECN_rcv_syn(tp, th);
+ tcp_ecn_rcv_syn(tp, th);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -5902,6 +5860,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
#endif
}
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We invert the test in this case: if our
+ * local socket wants ECN, but the peer only set ece/cwr (and not
+ * ECT in the IP header), it is probably a non-DCTCP-aware sender.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+ const struct sk_buff *skb,
+ const struct sock *listen_sk)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct net *net = sock_net(listen_sk);
+ bool th_ecn = th->ece && th->cwr;
+ bool ect, need_ecn;
+
+ if (!th_ecn)
+ return;
+
+ ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+ need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+ if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+ else if (ect && need_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+}
+
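
Condensing the comment above into the decision table tcp_ecn_create_request() implements (th_ecn = both ece and cwr set on the SYN, ect = an ECT codepoint in the IP header, need_ecn = tcp_ca_needs_ecn() on the listener):

	th_ecn	ect	need_ecn	sysctl_tcp_ecn	->	ecn_ok
	0	-	-		-			0	(peer did not request ECN)
	1	0	0		1			1	(classic RFC 3168 negotiation)
	1	1	1		-			1	(DCTCP-style: CA wants ECN, peer marks the SYN ECT)
	1	(any other combination)			0
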
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -5910,7 +5902,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct request_sock *req;
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = NULL;
- __u32 isn = TCP_SKB_CB(skb)->when;
+ __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
bool want_cookie = false, fastopen;
struct flowi fl;
struct tcp_fastopen_cookie foc = { .len = -1 };
@@ -5962,7 +5954,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, skb, sock_net(sk));
+ tcp_ecn_create_request(req, skb, sk);
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);