Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/inet_fragment.c |   2
-rw-r--r--  net/ipv4/ip_fragment.c   | 183
-rw-r--r--  net/ipv4/tcp_dctcp.c     |   4
-rw-r--r--  net/ipv4/tcp_input.c     |  16
4 files changed, 154 insertions(+), 51 deletions(-)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6d258a5669e7..bcb11f3a27c0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -146,7 +146,7 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = xp;
} while (fp);
} else {
- sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
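Why a run-aware purge is needed here: with this patch, each rb-tree node may head a chain of skbs linked through FRAG_CB(skb)->next_frag, so freeing only the tree nodes would leak the chained fragments. A minimal userspace sketch of the idea, using hypothetical toy types rather than the kernel's sk_buff/rb-tree code (the real helper, inet_frag_rbtree_purge(), is added in ip_fragment.c below):

/*
 * Toy illustration: free a whole "run" behind one tree entry and
 * account for its memory, mirroring the inner loop of the new
 * inet_frag_rbtree_purge().  Not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {               /* stands in for one rb-tree entry        */
	struct node *next_frag; /* rest of the run chained behind it      */
	int truesize;
};

/* Free every fragment in the run and return the accounted memory. */
static int purge_run(struct node *head)
{
	int sum = 0;

	while (head) {
		struct node *next = head->next_frag;

		sum += head->truesize;
		free(head);
		head = next;
	}
	return sum;
}

int main(void)
{
	struct node *tail = malloc(sizeof(*tail));
	struct node *head = malloc(sizeof(*head));

	*tail = (struct node){ NULL, 1500 };
	*head = (struct node){ tail, 1500 };   /* run of two fragments    */

	/* Freeing only @head would leak @tail; walking next_frag does not. */
	printf("released %d bytes of truesize\n", purge_run(head));
	return 0;
}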
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7cb7ed761d8c..88281fbce88c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -57,6 +57,57 @@
*/
static const char ip_frag_cache_name[] = "ip4-frags";
+/* Use skb->cb to track consecutive/adjacent fragments coming at
+ * the end of the queue. Nodes in the rb-tree queue will
+ * contain "runs" of one or more adjacent fragments.
+ *
+ * Invariants:
+ * - next_frag is NULL at the tail of a "run";
+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
+ */
+struct ipfrag_skb_cb {
+ struct inet_skb_parm h;
+ struct sk_buff *next_frag;
+ int frag_run_len;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
+
+static void ip4_frag_init_run(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
+
+ FRAG_CB(skb)->next_frag = NULL;
+ FRAG_CB(skb)->frag_run_len = skb->len;
+}
+
+/* Append skb to the last "run". */
+static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
+ struct sk_buff *skb)
+{
+ RB_CLEAR_NODE(&skb->rbnode);
+ FRAG_CB(skb)->next_frag = NULL;
+
+ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
+ FRAG_CB(q->fragments_tail)->next_frag = skb;
+ q->fragments_tail = skb;
+}
+
+/* Create a new "run" with the skb. */
+static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
+{
+ if (q->last_run_head)
+ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
+ &q->last_run_head->rbnode.rb_right);
+ else
+ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
+
+ ip4_frag_init_run(skb);
+ q->fragments_tail = skb;
+ q->last_run_head = skb;
+}
+
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
struct inet_frag_queue q;
@@ -75,8 +126,8 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
- struct net_device *dev);
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev);
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -168,7 +219,12 @@ static void ip_expire(struct timer_list *t)
head = skb_rb_first(&qp->q.rb_fragments);
if (!head)
goto out;
- rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ if (FRAG_CB(head)->next_frag)
+ rb_replace_node(&head->rbnode,
+ &FRAG_CB(head)->next_frag->rbnode,
+ &qp->q.rb_fragments);
+ else
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
memset(&head->rbnode, 0, sizeof(head->rbnode));
barrier();
}
@@ -269,7 +325,7 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT;
}
- sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
@@ -278,6 +334,7 @@ static int ip_frag_reinit(struct ipq *qp)
qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
qp->iif = 0;
qp->ecn = 0;
@@ -289,7 +346,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct rb_node **rbn, *parent;
- struct sk_buff *skb1;
+ struct sk_buff *skb1, *prev_tail;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
@@ -367,38 +424,41 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
*/
/* Find out where to put this fragment. */
- skb1 = qp->q.fragments_tail;
- if (!skb1) {
- /* This is the first fragment we've received. */
- rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
- qp->q.fragments_tail = skb;
- } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
- /* This is the common/special case: skb goes to the end. */
+ prev_tail = qp->q.fragments_tail;
+ if (!prev_tail)
+ ip4_frag_create_run(&qp->q, skb); /* First fragment. */
+ else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
+ /* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
- if (offset < (skb1->ip_defrag_offset + skb1->len))
+ if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
goto discard_qp;
- /* Insert after skb1. */
- rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
- qp->q.fragments_tail = skb;
+ if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
+ ip4_frag_append_to_last_run(&qp->q, skb);
+ else
+ ip4_frag_create_run(&qp->q, skb);
} else {
- /* Binary search. Note that skb can become the first fragment, but
- * not the last (covered above). */
+ /* Binary search. Note that skb can become the first fragment,
+ * but not the last (covered above).
+ */
rbn = &qp->q.rb_fragments.rb_node;
do {
parent = *rbn;
skb1 = rb_to_skb(parent);
if (end <= skb1->ip_defrag_offset)
rbn = &parent->rb_left;
- else if (offset >= skb1->ip_defrag_offset + skb1->len)
+ else if (offset >= skb1->ip_defrag_offset +
+ FRAG_CB(skb1)->frag_run_len)
rbn = &parent->rb_right;
else /* Found an overlap with skb1. */
goto discard_qp;
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
- * one of its NULL left/right children. Insert skb. */
+ * one of its NULL left/right children. Insert skb.
+ */
+ ip4_frag_init_run(skb);
rb_link_node(&skb->rbnode, parent, rbn);
+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
}
- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
if (dev)
qp->iif = dev->ifindex;
@@ -425,7 +485,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, skb, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev);
skb->_skb_refdst = orefdst;
return err;
}
@@ -444,7 +504,7 @@ err:
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
@@ -468,10 +528,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
goto out_nomem;
- rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+ if (RB_EMPTY_NODE(&skb->rbnode))
+ FRAG_CB(prev_tail)->next_frag = fp;
+ else
+ rb_replace_node(&skb->rbnode, &fp->rbnode,
+ &qp->q.rb_fragments);
if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp;
skb_morph(skb, head);
+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
rb_replace_node(&head->rbnode, &skb->rbnode,
&qp->q.rb_fragments);
consume_skb(head);
@@ -507,7 +573,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
- skb->truesize += clone->truesize;
+ head->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
@@ -520,24 +586,36 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
skb_push(head, head->data - skb_network_header(head));
/* Traverse the tree in order, to build frag_list. */
+ fp = FRAG_CB(head)->next_frag;
rbn = rb_next(&head->rbnode);
rb_erase(&head->rbnode, &qp->q.rb_fragments);
- while (rbn) {
- struct rb_node *rbnext = rb_next(rbn);
- fp = rb_to_skb(rbn);
- rb_erase(rbn, &qp->q.rb_fragments);
- rbn = rbnext;
- *nextp = fp;
- nextp = &fp->next;
- fp->prev = NULL;
- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
+ while (rbn || fp) {
+ /* fp points to the next sk_buff in the current run;
+ * rbn points to the next run.
+ */
+ /* Go through the current run. */
+ while (fp) {
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ fp = FRAG_CB(fp)->next_frag;
+ }
+ /* Move to the next run. */
+ if (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &qp->q.rb_fragments);
+ rbn = rbnext;
+ }
}
sub_frag_mem_limit(qp->q.net, head->truesize);
@@ -573,6 +651,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
return 0;
out_nomem:
@@ -654,6 +733,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_check_defrag);
+unsigned int inet_frag_rbtree_purge(struct rb_root *root)
+{
+ struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
+ while (skb) {
+ struct sk_buff *next = FRAG_CB(skb)->next_frag;
+
+ sum += skb->truesize;
+ kfree_skb(skb);
+ skb = next;
+ }
+ }
+ return sum;
+}
+EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
#ifdef CONFIG_SYSCTL
static int dist_min;
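The run bookkeeping added above (ip4_frag_init_run(), ip4_frag_append_to_last_run(), ip4_frag_create_run()) can be illustrated with a small standalone sketch. The types and helpers below are hypothetical stand-ins, not the kernel's; the kernel keeps this state in skb->cb via FRAG_CB() and in the inet_frag_queue rb-tree. A fragment that lands exactly at the current tail offset joins the tail run; anything else starts a new run, i.e. a new rb-tree node:

/*
 * Minimal userspace sketch of the "run" bookkeeping.  Toy types,
 * not kernel code.
 */
#include <stdio.h>

struct frag {
	int offset;             /* byte offset of this fragment          */
	int len;                /* payload length of this fragment       */
	struct frag *next_frag; /* next fragment in the same run         */
	int run_len;            /* head of a run: total bytes in the run */
};

struct queue {
	struct frag *last_run_head; /* head of the tail run (tree node)  */
	struct frag *tail;          /* last fragment queued in order     */
};

/* Start a new run headed by @f (the kernel also links @f into the
 * rb-tree here). */
static void create_run(struct queue *q, struct frag *f)
{
	f->next_frag = NULL;
	f->run_len = f->len;
	q->last_run_head = f;
	q->tail = f;
}

/* Append @f to the tail run: only the run head's run_len grows, and
 * the old tail is chained to @f. */
static void append_to_last_run(struct queue *q, struct frag *f)
{
	f->next_frag = NULL;
	q->last_run_head->run_len += f->len;
	q->tail->next_frag = f;
	q->tail = f;
}

/* Decide, as ip_frag_queue() does for in-order arrivals, whether the
 * fragment extends the tail run or starts a new one. */
static void queue_frag(struct queue *q, struct frag *f)
{
	if (!q->tail)
		create_run(q, f);                 /* first fragment      */
	else if (f->offset == q->tail->offset + q->tail->len)
		append_to_last_run(q, f);         /* contiguous          */
	else
		create_run(q, f);                 /* gap: new run        */
}

int main(void)
{
	struct queue q = { NULL, NULL };
	struct frag a = { 0, 1480 }, b = { 1480, 1480 }, c = { 4440, 1480 };

	queue_frag(&q, &a);
	queue_frag(&q, &b);   /* contiguous: joins a's run               */
	queue_frag(&q, &c);   /* leaves a gap: becomes its own run       */
	printf("run lens: a=%d c=%d\n", a.run_len, c.run_len);
	return 0;
}

The payoff is that in-order arrival, the common case, only touches the tail pointers instead of doing an rb-tree insertion per fragment; the tree gains a node only when a gap appears.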
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 8b637f9f23a2..ca61e2a659e7 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -136,7 +136,7 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
*/
if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
__tcp_send_ack(sk, ca->prior_rcv_nxt);
- tcp_enter_quickack_mode(sk, 1);
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
ca->prior_rcv_nxt = tp->rcv_nxt;
@@ -157,7 +157,7 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
*/
if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
__tcp_send_ack(sk, ca->prior_rcv_nxt);
- tcp_enter_quickack_mode(sk, 1);
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
ca->prior_rcv_nxt = tp->rcv_nxt;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 715d541b52dd..4c2dd9f863f7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -245,16 +245,16 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
-static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr) {
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
/* If the sender is telling us it has entered CWR, then its
* cwnd may be very low (even just 1 packet), so we should ACK
* immediately.
*/
- tcp_enter_quickack_mode((struct sock *)tp, 2);
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
}
@@ -4703,7 +4703,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
skb_dst_drop(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
- tcp_ecn_accept_cwr(tp, skb);
+ tcp_ecn_accept_cwr(sk, skb);
tp->rx_opt.dsack = 0;
@@ -4735,11 +4735,11 @@ queue_and_out:
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
- /* RFC2581. 4.2. SHOULD send immediate ACK, when
+ /* RFC5681. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
- inet_csk(sk)->icsk_ack.pingpong = 0;
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
if (tp->rx_opt.num_sacks)
@@ -5179,7 +5179,9 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
- tcp_in_quickack_mode(sk)) {
+ tcp_in_quickack_mode(sk) ||
+ /* Protocol state mandates a one-time immediate ACK */
+ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
send_now:
tcp_send_ack(sk);
return;
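The ICSK_ACK_NOW uses above (DCTCP CE-state changes, receipt of CWR, and an out-of-order gap being filled) all request a single immediate ACK instead of entering quickack mode. A toy sketch of that one-shot behaviour follows; the names and the point at which the flag is cleared are illustrative assumptions, not the kernel's exact code paths:

/*
 * Toy sketch of a one-shot "ACK now" flag: unlike quickack mode,
 * which forces several immediate ACKs, the flag requests exactly one
 * immediate ACK and is consumed when it is sent.  Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define ACK_SCHED  0x01   /* an ACK is wanted                        */
#define ACK_NOW    0x02   /* protocol state demands an immediate ACK */

struct toy_sock {
	unsigned int ack_pending;
	int quick;            /* remaining quickack credits              */
};

static bool should_ack_now(const struct toy_sock *sk)
{
	/* Mirrors the shape of __tcp_ack_snd_check(): either we are in
	 * quickack mode, or a one-time immediate ACK was requested. */
	return sk->quick > 0 || (sk->ack_pending & ACK_NOW);
}

static void send_ack(struct toy_sock *sk)
{
	printf("ACK sent\n");
	/* Assumption for this sketch: the one-shot flag is consumed once
	 * the ACK goes out, so it cannot force further ACKs. */
	sk->ack_pending &= ~(ACK_SCHED | ACK_NOW);
	if (sk->quick > 0)
		sk->quick--;
}

int main(void)
{
	struct toy_sock sk = { .ack_pending = ACK_SCHED, .quick = 0 };

	/* e.g. a CWR was seen, or an out-of-order gap was just filled */
	sk.ack_pending |= ACK_NOW;

	if (should_ack_now(&sk))
		send_ack(&sk);                        /* fires once...   */
	if (!should_ack_now(&sk))
		printf("back to delayed ACKs\n");     /* ...then stops   */
	return 0;
}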