summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig2
-rw-r--r--net/ipv4/arp.c3
-rw-r--r--net/ipv4/cipso_ipv4.c11
-rw-r--r--net/ipv4/inet_connection_sock.c7
-rw-r--r--net/ipv4/inetpeer.c81
-rw-r--r--net/ipv4/ip_gre.c14
-rw-r--r--net/ipv4/ip_options.c2
-rw-r--r--net/ipv4/ip_sockglue.c7
-rw-r--r--net/ipv4/ping.c1
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/route.c12
-rw-r--r--net/ipv4/syncookies.c30
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c38
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_cubic.c10
-rw-r--r--net/ipv4/tcp_input.c93
-rw-r--r--net/ipv4/tcp_ipv4.c17
-rw-r--r--net/ipv4/tcp_memcontrol.c6
-rw-r--r--net/ipv4/tcp_output.c6
-rw-r--r--net/ipv4/tcp_timer.c5
-rw-r--r--net/ipv4/xfrm4_mode_beet.c5
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c6
23 files changed, 238 insertions, 137 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index aa2a2c79776f..d183262943d9 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -409,7 +409,7 @@ config INET_TCP_DIAG
config INET_UDP_DIAG
tristate "UDP: socket monitoring interface"
- depends on INET_DIAG
+ depends on INET_DIAG && (IPV6 || IPV6=n)
default n
---help---
Support for UDP socket monitoring interface used by the ss tool.
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59402be133f0..63e49890ad31 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -863,7 +863,8 @@ static int arp_process(struct sk_buff *skb)
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
- pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
+ (rt->dst.dev != dev &&
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 86f3b885b4f3..c48adc565e92 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1857,11 +1857,6 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
return CIPSO_V4_HDR_LEN + ret_val;
}
-static void opt_kfree_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct ip_options_rcu, rcu));
-}
-
/**
* cipso_v4_sock_setattr - Add a CIPSO option to a socket
* @sk: the socket
@@ -1938,7 +1933,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
}
rcu_assign_pointer(sk_inet->inet_opt, opt);
if (old)
- call_rcu(&old->rcu, opt_kfree_rcu);
+ kfree_rcu(old, rcu);
return 0;
@@ -2005,7 +2000,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
req_inet = inet_rsk(req);
opt = xchg(&req_inet->opt, opt);
if (opt)
- call_rcu(&opt->rcu, opt_kfree_rcu);
+ kfree_rcu(opt, rcu);
return 0;
@@ -2075,7 +2070,7 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
* remove the entire option struct */
*opt_ptr = NULL;
hdr_delta = opt->opt.optlen;
- call_rcu(&opt->rcu, opt_kfree_rcu);
+ kfree_rcu(opt, rcu);
}
return hdr_delta;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 2e4e24476c4c..19d66cefd7d3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -123,11 +123,14 @@ again:
smallest_size = tb->num_owners;
smallest_rover = rover;
if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
- spin_unlock(&head->lock);
snum = smallest_rover;
- goto have_snum;
+ goto tb_found;
}
}
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+ snum = rover;
+ goto tb_found;
+ }
goto next;
}
break;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index bf4a9c4808e1..d4d61b694fab 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
#include <net/secure_seq.h>
@@ -66,6 +67,11 @@
static struct kmem_cache *peer_cachep __read_mostly;
+static LIST_HEAD(gc_list);
+static const int gc_delay = 60 * HZ;
+static struct delayed_work gc_work;
+static DEFINE_SPINLOCK(gc_lock);
+
#define node_height(x) x->avl_height
#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
@@ -102,6 +108,50 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
+static void inetpeer_gc_worker(struct work_struct *work)
+{
+ struct inet_peer *p, *n;
+ LIST_HEAD(list);
+
+ spin_lock_bh(&gc_lock);
+ list_replace_init(&gc_list, &list);
+ spin_unlock_bh(&gc_lock);
+
+ if (list_empty(&list))
+ return;
+
+ list_for_each_entry_safe(p, n, &list, gc_list) {
+
+ if(need_resched())
+ cond_resched();
+
+ if (p->avl_left != peer_avl_empty) {
+ list_add_tail(&p->avl_left->gc_list, &list);
+ p->avl_left = peer_avl_empty;
+ }
+
+ if (p->avl_right != peer_avl_empty) {
+ list_add_tail(&p->avl_right->gc_list, &list);
+ p->avl_right = peer_avl_empty;
+ }
+
+ n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
+
+ if (!atomic_read(&p->refcnt)) {
+ list_del(&p->gc_list);
+ kmem_cache_free(peer_cachep, p);
+ }
+ }
+
+ if (list_empty(&list))
+ return;
+
+ spin_lock_bh(&gc_lock);
+ list_splice(&list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
@@ -126,6 +176,7 @@ void __init inet_initpeers(void)
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
+ INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker);
}
static int addr_compare(const struct inetpeer_addr *a,
@@ -447,9 +498,8 @@ relookup:
p->rate_last = 0;
p->pmtu_expires = 0;
p->pmtu_orig = 0;
- p->redirect_genid = 0;
memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
-
+ INIT_LIST_HEAD(&p->gc_list);
/* Link the node. */
link_to_pool(p, base);
@@ -509,3 +559,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
return rc;
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
+
+void inetpeer_invalidate_tree(int family)
+{
+ struct inet_peer *old, *new, *prev;
+ struct inet_peer_base *base = family_to_base(family);
+
+ write_seqlock_bh(&base->lock);
+
+ old = base->root;
+ if (old == peer_avl_empty_rcu)
+ goto out;
+
+ new = peer_avl_empty_rcu;
+
+ prev = cmpxchg(&base->root, old, new);
+ if (prev == old) {
+ base->total = 0;
+ spin_lock(&gc_lock);
+ list_add_tail(&prev->gc_list, &gc_list);
+ spin_unlock(&gc_lock);
+ schedule_delayed_work(&gc_work, gc_delay);
+ }
+
+out:
+ write_sequnlock_bh(&base->lock);
+}
+EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2b53a1f7abf6..38673d2860e2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -65,7 +65,7 @@
it is infeasible task. The most general solutions would be
to keep skb->encapsulation counter (sort of local ttl),
and silently drop packet when it expires. It is a good
- solution, but it supposes maintaing new variable in ALL
+ solution, but it supposes maintaining new variable in ALL
skb, even if no tunneling is used.
Current solution: xmit_recursion breaks dead loops. This is a percpu
@@ -91,14 +91,14 @@
One of them is to parse packet trying to detect inner encapsulation
made by our node. It is difficult or even impossible, especially,
- taking into account fragmentation. TO be short, tt is not solution at all.
+ taking into account fragmentation. TO be short, ttl is not solution at all.
Current solution: The solution was UNEXPECTEDLY SIMPLE.
We force DF flag on tunnels with preconfigured hop limit,
that is ALL. :-) Well, it does not remove the problem completely,
but exponential growth of network traffic is changed to linear
(branches, that exceed pmtu are pruned) and tunnel mtu
- fastly degrades to value <68, where looping stops.
+ rapidly degrades to value <68, where looping stops.
Yes, it is not good if there exists a router in the loop,
which does not force DF, even when encapsulating packets have DF set.
But it is not our problem! Nobody could accuse us, we made
@@ -422,6 +422,10 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
if (register_netdevice(dev) < 0)
goto failed_free;
+ /* Can use a lockless transmit, unless we generate output sequences */
+ if (!(nt->parms.o_flags & GRE_SEQ))
+ dev->features |= NETIF_F_LLTX;
+
dev_hold(dev);
ipgre_tunnel_link(ign, nt);
return nt;
@@ -453,8 +457,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
GRE tunnels with enabled checksum. Tell them "thank you".
Well, I wonder, rfc1812 was written by Cisco employee,
- what the hell these idiots break standrads established
- by themself???
+ what the hell these idiots break standards established
+ by themselves???
*/
const struct iphdr *iph = (const struct iphdr *)skb->data;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1e60f7679075..42dd1a90edea 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb)
}
if (srrptr + 3 <= srrspace) {
opt->is_changed = 1;
- ip_rt_get_source(&optptr[srrptr-1], skb, rt);
ip_hdr(skb)->daddr = opt->nexthop;
+ ip_rt_get_source(&optptr[srrptr-1], skb, rt);
optptr[2] = srrptr+4;
} else if (net_ratelimit())
printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8aa87c19fa00..5343d9ac510b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -445,11 +445,6 @@ out:
}
-static void opt_kfree_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct ip_options_rcu, rcu));
-}
-
/*
* Socket option code for IP. This is the end of the line after any
* TCP,UDP etc options on an IP socket.
@@ -525,7 +520,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
rcu_assign_pointer(inet->inet_opt, opt);
if (old)
- call_rcu(&old->rcu, opt_kfree_rcu);
+ kfree_rcu(old, rcu);
break;
}
case IP_PKTINFO:
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index aea5a199c37a..b072386cee21 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -630,6 +630,7 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+ err = -EOPNOTSUPP;
if (flags & MSG_OOB)
goto out;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3569d8ecaeac..6afc807ee2ad 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -216,7 +216,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
- SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bcacf54e5418..019774796174 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -132,7 +132,6 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int rt_chain_length_max __read_mostly = 20;
-static int redirect_genid;
static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
@@ -937,7 +936,7 @@ static void rt_cache_invalidate(struct net *net)
get_random_bytes(&shuffle, sizeof(shuffle));
atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
- redirect_genid++;
+ inetpeer_invalidate_tree(AF_INET);
}
/*
@@ -1485,10 +1484,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
peer = rt->peer;
if (peer) {
- if (peer->redirect_learned.a4 != new_gw ||
- peer->redirect_genid != redirect_genid) {
+ if (peer->redirect_learned.a4 != new_gw) {
peer->redirect_learned.a4 = new_gw;
- peer->redirect_genid = redirect_genid;
atomic_inc(&__rt_peer_genid);
}
check_peer_redir(&rt->dst, peer);
@@ -1793,8 +1790,6 @@ static void ipv4_validate_peer(struct rtable *rt)
if (peer) {
check_peer_pmtu(&rt->dst, peer);
- if (peer->redirect_genid != redirect_genid)
- peer->redirect_learned.a4 = 0;
if (peer->redirect_learned.a4 &&
peer->redirect_learned.a4 != rt->rt_gateway)
check_peer_redir(&rt->dst, peer);
@@ -1958,8 +1953,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
dst_init_metrics(&rt->dst, peer->metrics, false);
check_peer_pmtu(&rt->dst, peer);
- if (peer->redirect_genid != redirect_genid)
- peer->redirect_learned.a4 = 0;
+
if (peer->redirect_learned.a4 &&
peer->redirect_learned.a4 != rt->rt_gateway) {
rt->rt_gateway = peer->redirect_learned.a4;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 51fdbb490437..eab2a7fb15d1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -278,6 +278,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct rtable *rt;
__u8 rcv_wscale;
bool ecn_ok = false;
+ struct flowi4 fl4;
if (!sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
@@ -346,20 +347,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
* hasn't changed since we received the original syn, but I see
* no easy way to do this.
*/
- {
- struct flowi4 fl4;
-
- flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
- RT_SCOPE_UNIVERSE, IPPROTO_TCP,
- inet_sk_flowi_flags(sk),
- (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
- ireq->loc_addr, th->source, th->dest);
- security_req_classify_flow(req, flowi4_to_flowi(&fl4));
- rt = ip_route_output_key(sock_net(sk), &fl4);
- if (IS_ERR(rt)) {
- reqsk_free(req);
- goto out;
- }
+ flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
+ RT_SCOPE_UNIVERSE, IPPROTO_TCP,
+ inet_sk_flowi_flags(sk),
+ (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
+ ireq->loc_addr, th->source, th->dest);
+ security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(sock_net(sk), &fl4);
+ if (IS_ERR(rt)) {
+ reqsk_free(req);
+ goto out;
}
/* Try to redo what tcp_v4_send_synack did. */
@@ -373,5 +370,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->rcv_wscale = rcv_wscale;
ret = get_cookie_sock(sk, skb, req, &rt->dst);
+ /* ip_queue_xmit() depends on our flow being setup
+ * Normal sockets get it right from inet_csk_route_child_sock()
+ */
+ if (ret)
+ inet_sk(ret)->cork.fl.u.ip4 = fl4;
out: return ret;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4aa7e9dc0cbb..7a7724da9bff 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -778,7 +778,6 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
static __net_init int ipv4_sysctl_init_net(struct net *net)
{
struct ctl_table *table;
- unsigned long limit;
table = ipv4_net_table;
if (!net_eq(net, &init_net)) {
@@ -814,11 +813,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
net->ipv4.sysctl_rt_cache_rebuild_count = 4;
- limit = nr_free_buffer_pages() / 8;
- limit = max(limit, 128UL);
- net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
- net->ipv4.sysctl_tcp_mem[1] = limit;
- net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+ tcp_init_mem(net);
net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
net_ipv4_ctl_path, table);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9bcdec3ad772..22ef5f9fd2ff 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1876,6 +1876,20 @@ void tcp_shutdown(struct sock *sk, int how)
}
EXPORT_SYMBOL(tcp_shutdown);
+bool tcp_check_oom(struct sock *sk, int shift)
+{
+ bool too_many_orphans, out_of_socket_memory;
+
+ too_many_orphans = tcp_too_many_orphans(sk, shift);
+ out_of_socket_memory = tcp_out_of_memory(sk);
+
+ if (too_many_orphans && net_ratelimit())
+ pr_info("TCP: too many orphaned sockets\n");
+ if (out_of_socket_memory && net_ratelimit())
+ pr_info("TCP: out of memory -- consider tuning tcp_mem\n");
+ return too_many_orphans || out_of_socket_memory;
+}
+
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
@@ -2015,10 +2029,7 @@ adjudge_to_death:
}
if (sk->sk_state != TCP_CLOSE) {
sk_mem_reclaim(sk);
- if (tcp_too_many_orphans(sk, 0)) {
- if (net_ratelimit())
- printk(KERN_INFO "TCP: too many of orphaned "
- "sockets\n");
+ if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(sock_net(sk),
@@ -3216,11 +3227,21 @@ static int __init set_thash_entries(char *str)
}
__setup("thash_entries=", set_thash_entries);
+void tcp_init_mem(struct net *net)
+{
+ unsigned long limit = nr_free_buffer_pages() / 8;
+ limit = max(limit, 128UL);
+ net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
+ net->ipv4.sysctl_tcp_mem[1] = limit;
+ net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+}
+
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long limit;
- int i, max_share, cnt;
+ int max_share, cnt;
+ unsigned int i;
unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3263,7 +3284,7 @@ void __init tcp_init(void)
&tcp_hashinfo.bhash_size,
NULL,
64 * 1024);
- tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+ tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
@@ -3276,9 +3297,10 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans = cnt / 2;
sysctl_max_syn_backlog = max(128, cnt / 256);
+ tcp_init_mem(&init_net);
/* Set per-socket limits to no more than 1/128 the pressure threshold */
- limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1])
- << (PAGE_SHIFT - 7);
+ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10);
+ limit = max(limit, 128UL);
max_share = min(4UL*1024*1024, limit);
sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 6187eb4d1dcf..f45e1c242440 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -63,7 +63,6 @@ static inline void bictcp_reset(struct bictcp *ca)
{
ca->cnt = 0;
ca->last_max_cwnd = 0;
- ca->loss_cwnd = 0;
ca->last_cwnd = 0;
ca->last_time = 0;
ca->epoch_start = 0;
@@ -72,7 +71,11 @@ static inline void bictcp_reset(struct bictcp *ca)
static void bictcp_init(struct sock *sk)
{
- bictcp_reset(inet_csk_ca(sk));
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
+
if (initial_ssthresh)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
@@ -127,7 +130,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
/* if in slow start or link utilization is very low */
- if (ca->loss_cwnd == 0) {
+ if (ca->last_max_cwnd == 0) {
if (ca->cnt > 20) /* increase cwnd 5% per RTT */
ca->cnt = 20;
}
@@ -185,7 +188,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct bictcp *ca = inet_csk_ca(sk);
- return max(tp->snd_cwnd, ca->last_max_cwnd);
+ return max(tp->snd_cwnd, ca->loss_cwnd);
}
static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index f376b05cca81..a9077f441cb2 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -107,7 +107,6 @@ static inline void bictcp_reset(struct bictcp *ca)
{
ca->cnt = 0;
ca->last_max_cwnd = 0;
- ca->loss_cwnd = 0;
ca->last_cwnd = 0;
ca->last_time = 0;
ca->bic_origin_point = 0;
@@ -142,7 +141,10 @@ static inline void bictcp_hystart_reset(struct sock *sk)
static void bictcp_init(struct sock *sk)
{
- bictcp_reset(inet_csk_ca(sk));
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
if (hystart)
bictcp_hystart_reset(sk);
@@ -275,7 +277,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
* The initial growth of cubic function may be too conservative
* when the available bandwidth is still unknown.
*/
- if (ca->loss_cwnd == 0 && ca->cnt > 20)
+ if (ca->last_max_cwnd == 0 && ca->cnt > 20)
ca->cnt = 20; /* increase cwnd 5% per RTT */
/* TCP Friendly */
@@ -342,7 +344,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
{
struct bictcp *ca = inet_csk_ca(sk);
- return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}
static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2877c3e09587..b5e315f13641 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -105,7 +105,6 @@ int sysctl_tcp_abc __read_mostly;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
-#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -1040,13 +1039,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
* These 6 states form finite state machine, controlled by the following events:
* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
- * 3. Loss detection event of one of three flavors:
+ * 3. Loss detection event of two flavors:
* A. Scoreboard estimator decided the packet is lost.
* A'. Reno "three dupacks" marks head of queue lost.
- * A''. Its FACK modfication, head until snd.fack is lost.
- * B. SACK arrives sacking data transmitted after never retransmitted
- * hole was sent out.
- * C. SACK arrives sacking SND.NXT at the moment, when the
+ * A''. Its FACK modification, head until snd.fack is lost.
+ * B. SACK arrives sacking SND.NXT at the moment, when the
* segment was retransmitted.
* 4. D-SACK added new rule: D-SACK changes any tag to S.
*
@@ -1153,7 +1150,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
}
/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "C". Later note: FACK people cheated me again 8), we have to account
+ * Event "B". Later note: FACK people cheated me again 8), we have to account
* for reordering! Ugly, but should help.
*
* Search retransmitted skbs from write_queue that were sent when snd_nxt was
@@ -1310,25 +1307,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
return in_sack;
}
-static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
- struct tcp_sacktag_state *state,
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+ struct tcp_sacktag_state *state, u8 sacked,
+ u32 start_seq, u32 end_seq,
int dup_sack, int pcount)
{
struct tcp_sock *tp = tcp_sk(sk);
- u8 sacked = TCP_SKB_CB(skb)->sacked;
int fack_count = state->fack_count;
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (tp->undo_marker && tp->undo_retrans &&
- after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+ after(end_seq, tp->undo_marker))
tp->undo_retrans--;
if (sacked & TCPCB_SACKED_ACKED)
state->reord = min(fack_count, state->reord);
}
/* Nothing to do; acked frame is about to be dropped (was ACKed). */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ if (!after(end_seq, tp->snd_una))
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
@@ -1347,13 +1345,13 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
/* New sack for not retransmitted frame,
* which was in hole. It is reordering.
*/
- if (before(TCP_SKB_CB(skb)->seq,
+ if (before(start_seq,
tcp_highest_sack_seq(tp)))
state->reord = min(fack_count,
state->reord);
/* SACK enhanced F-RTO (RFC4138; Appendix B) */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
+ if (!after(end_seq, tp->frto_highmark))
state->flag |= FLAG_ONLY_ORIG_SACKED;
}
@@ -1371,8 +1369,7 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->lost_skb_hint)->seq))
+ before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
if (fack_count > tp->fackets_out)
@@ -1391,6 +1388,9 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
return sacked;
}
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
unsigned int pcount, int shifted, int mss,
@@ -1398,9 +1398,20 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+ u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
+ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
BUG_ON(!pcount);
+ /* Adjust counters and hints for the newly sacked sequence
+ * range but discard the return value since prev is already
+ * marked. We must tag the range first because the seq
+ * advancement below implicitly advances
+ * tcp_highest_sack_seq() when skb is highest_sack.
+ */
+ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+ start_seq, end_seq, dup_sack, pcount);
+
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@@ -1427,9 +1438,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
skb_shinfo(skb)->gso_type = 0;
}
- /* We discard results */
- tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
-
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1577,6 +1585,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
}
}
+ /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+ goto fallback;
+
if (!skb_shift(prev, skb, len))
goto fallback;
if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
@@ -1667,10 +1679,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
break;
if (in_sack) {
- TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
- state,
- dup_sack,
- tcp_skb_pcount(skb));
+ TCP_SKB_CB(skb)->sacked =
+ tcp_sacktag_one(sk,
+ state,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ dup_sack,
+ tcp_skb_pcount(skb));
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1844,10 +1860,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (found_dup_sack && ((i + 1) == first_sack_index))
next_dup = &sp[i + 1];
- /* Event "B" in the comment above. */
- if (after(end_seq, tp->high_seq))
- state.flag |= FLAG_DATA_LOST;
-
/* Skip too early cached blocks */
while (tcp_sack_cache_ok(tp, cache) &&
!before(start_seq, cache->end_seq))
@@ -2515,8 +2527,11 @@ static void tcp_timeout_skbs(struct sock *sk)
tcp_verify_left_out(tp);
}
-/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
- * is against sacked "cnt", otherwise it's against facked "cnt"
+/* Detect loss in event "A" above by marking head of queue up as lost.
+ * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * has at least tp->reordering SACKed seqments above it; "packets" refers to
+ * the maximum SACKed segments to pass before reaching this limit.
*/
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
@@ -2525,6 +2540,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
int cnt, oldcnt;
int err;
unsigned int mss;
+ /* Use SACK to deduce losses of new sequences sent during recovery */
+ const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
@@ -2546,7 +2563,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
- if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+ if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
break;
oldcnt = cnt;
@@ -2556,6 +2573,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
if (cnt > packets) {
if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
(oldcnt >= packets))
break;
@@ -3033,19 +3051,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
if (tcp_check_sack_reneging(sk, flag))
return;
- /* C. Process data loss notification, provided it is valid. */
- if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
- before(tp->snd_una, tp->high_seq) &&
- icsk->icsk_ca_state != TCP_CA_Open &&
- tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
- }
-
- /* D. Check consistency of the current state. */
+ /* C. Check consistency of the current state. */
tcp_verify_left_out(tp);
- /* E. Check state exit conditions. State can be terminated
+ /* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
@@ -3077,7 +3086,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
}
}
- /* F. Process state. */
+ /* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1eb4ad57670e..fd54c5f8a255 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -631,7 +631,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_len = sizeof(rep.th);
#ifdef CONFIG_TCP_MD5SIG
- key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
+ key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
if (key) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -651,6 +651,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+ /* When socket is gone, all binding information is lost.
+ * routing might fail in this case. using iif for oif to
+ * make sure we can deliver it
+ */
+ arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
net = dev_net(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos;
@@ -1461,9 +1466,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
- if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
- goto put_and_exit;
-
+ if (!dst) {
+ dst = inet_csk_route_child_sock(sk, newsk, req);
+ if (!dst)
+ goto put_and_exit;
+ } else {
+ /* syncookie case : see end of cookie_v4_check() */
+ }
sk_setup_caps(newsk, dst);
tcp_mtup_init(newsk);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index e714c6834c90..e795272fbe9e 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp)
val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
if (val != RESOURCE_MAX)
- jump_label_dec(&memcg_socket_limit_enabled);
+ static_key_slow_dec(&memcg_socket_limit_enabled);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
@@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
net->ipv4.sysctl_tcp_mem[i]);
if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
- jump_label_dec(&memcg_socket_limit_enabled);
+ static_key_slow_dec(&memcg_socket_limit_enabled);
else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
- jump_label_inc(&memcg_socket_limit_enabled);
+ static_key_slow_inc(&memcg_socket_limit_enabled);
return 0;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c8de2780c7a..4ff3b6dc74fc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1141,11 +1141,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
sk_mem_uncharge(sk, len);
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
- /* Any change of skb->len requires recalculation of tso
- * factor and mss.
- */
+ /* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
return 0;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a516d1e399df..cd2e0723266d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -77,10 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
if (sk->sk_err_soft)
shift++;
- if (tcp_too_many_orphans(sk, shift)) {
- if (net_ratelimit())
- printk(KERN_INFO "Out of socket memory\n");
-
+ if (tcp_check_oom(sk, shift)) {
/* Catch exceptional cases, when connection requires reset.
* 1. Last segment was sent recently. */
if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index 63418185f524..e3db3f915114 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
skb_push(skb, sizeof(*iph));
skb_reset_network_header(skb);
-
- memmove(skb->data - skb->mac_len, skb_mac_header(skb),
- skb->mac_len);
- skb_set_mac_header(skb, -skb->mac_len);
+ skb_mac_header_rebuild(skb);
xfrm4_beet_make_header(skb);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 534972e114ac..ed4bf11ef9f4 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -66,7 +66,6 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
{
- const unsigned char *old_mac;
int err = -EINVAL;
if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -84,10 +83,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
if (!(x->props.flags & XFRM_STATE_NOECN))
ipip_ecn_decapsulate(skb);
- old_mac = skb_mac_header(skb);
- skb_set_mac_header(skb, -skb->mac_len);
- memmove(skb_mac_header(skb), old_mac, skb->mac_len);
skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
err = 0;
out: