Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 22
-rw-r--r--  net/ipv4/Makefile | 2
-rw-r--r--  net/ipv4/af_inet.c | 67
-rw-r--r--  net/ipv4/ah4.c | 3
-rw-r--r--  net/ipv4/arp.c | 12
-rw-r--r--  net/ipv4/cipso_ipv4.c | 4
-rw-r--r--  net/ipv4/devinet.c | 5
-rw-r--r--  net/ipv4/esp4.c | 332
-rw-r--r--  net/ipv4/esp4_offload.c | 106
-rw-r--r--  net/ipv4/fib_frontend.c | 25
-rw-r--r--  net/ipv4/fib_semantics.c | 85
-rw-r--r--  net/ipv4/fib_trie.c | 195
-rw-r--r--  net/ipv4/fou.c | 23
-rw-r--r--  net/ipv4/icmp.c | 133
-rw-r--r--  net/ipv4/igmp.c | 10
-rw-r--r--  net/ipv4/inet_connection_sock.c | 284
-rw-r--r--  net/ipv4/inet_diag.c | 75
-rw-r--r--  net/ipv4/inet_hashtables.c | 19
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 3
-rw-r--r--  net/ipv4/ip_fragment.c | 25
-rw-r--r--  net/ipv4/ip_gre.c | 6
-rw-r--r--  net/ipv4/ip_options.c | 2
-rw-r--r--  net/ipv4/ip_output.c | 53
-rw-r--r--  net/ipv4/ip_sockglue.c | 80
-rw-r--r--  net/ipv4/ip_tunnel.c | 10
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 12
-rw-r--r--  net/ipv4/ip_vti.c | 2
-rw-r--r--  net/ipv4/ipconfig.c | 4
-rw-r--r--  net/ipv4/ipip.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 296
-rw-r--r--  net/ipv4/netfilter.c | 7
-rw-r--r--  net/ipv4/netfilter/Kconfig | 14
-rw-r--r--  net/ipv4/netfilter/Makefile | 3
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 59
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 65
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 44
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 11
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_SYNPROXY.c | 19
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c | 18
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 151
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 15
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 45
-rw-r--r--  net/ipv4/netfilter/nf_dup_ipv4.c | 7
-rw-r--r--  net/ipv4/netfilter/nf_log_arp.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_log_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 20
-rw-r--r--  net/ipv4/netfilter/nf_reject_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_socket_ipv4.c | 163
-rw-r--r--  net/ipv4/netfilter/nft_dup_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nft_fib_ipv4.c | 236
-rw-r--r--  net/ipv4/netfilter/nft_masq_ipv4.c | 23
-rw-r--r--  net/ipv4/netfilter/nft_redir_ipv4.c | 22
-rw-r--r--  net/ipv4/netfilter/nft_reject_ipv4.c | 4
-rw-r--r--  net/ipv4/ping.c | 27
-rw-r--r--  net/ipv4/proc.c | 5
-rw-r--r--  net/ipv4/raw.c | 43
-rw-r--r--  net/ipv4/raw_diag.c | 266
-rw-r--r--  net/ipv4/route.c | 172
-rw-r--r--  net/ipv4/syncookies.c | 24
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 126
-rw-r--r--  net/ipv4/tcp.c | 220
-rw-r--r--  net/ipv4/tcp_bbr.c | 32
-rw-r--r--  net/ipv4/tcp_cdg.c | 2
-rw-r--r--  net/ipv4/tcp_cong.c | 14
-rw-r--r--  net/ipv4/tcp_dctcp.c | 1
-rw-r--r--  net/ipv4/tcp_fastopen.c | 57
-rw-r--r--  net/ipv4/tcp_highspeed.c | 11
-rw-r--r--  net/ipv4/tcp_hybla.c | 1
-rw-r--r--  net/ipv4/tcp_illinois.c | 10
-rw-r--r--  net/ipv4/tcp_input.c | 349
-rw-r--r--  net/ipv4/tcp_ipv4.c | 90
-rw-r--r--  net/ipv4/tcp_lp.c | 1
-rw-r--r--  net/ipv4/tcp_metrics.c | 32
-rw-r--r--  net/ipv4/tcp_minisocks.c | 30
-rw-r--r--  net/ipv4/tcp_output.c | 339
-rw-r--r--  net/ipv4/tcp_recovery.c | 149
-rw-r--r--  net/ipv4/tcp_scalable.c | 15
-rw-r--r--  net/ipv4/tcp_timer.c | 15
-rw-r--r--  net/ipv4/tcp_vegas.c | 1
-rw-r--r--  net/ipv4/tcp_veno.c | 10
-rw-r--r--  net/ipv4/tcp_westwood.c | 1
-rw-r--r--  net/ipv4/tcp_yeah.c | 10
-rw-r--r--  net/ipv4/udp.c | 373
-rw-r--r--  net/ipv4/udplite.c | 3
-rw-r--r--  net/ipv4/xfrm4_input.c | 6
-rw-r--r--  net/ipv4/xfrm4_mode_transport.c | 4
-rw-r--r--  net/ipv4/xfrm4_policy.c | 9
-rw-r--r--  net/ipv4/xfrm4_protocol.c | 3
-rw-r--r--  net/ipv4/xfrm4_state.c | 8
91 files changed, 3745 insertions, 1558 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index b54b3ca939db..91a2557942fa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX
config NET_IP_TUNNEL
tristate
select DST_CACHE
+ select GRO_CELLS
default n
config NET_IPGRE
@@ -360,6 +361,19 @@ config INET_ESP
If unsure, say Y.
+config INET_ESP_OFFLOAD
+ tristate "IP: ESP transformation offload"
+ depends on INET_ESP
+ select XFRM_OFFLOAD
+ default n
+ ---help---
+ Support for ESP transformation offload. This makes sense
+ only if this system really does IPsec and wants to do it
+ with high throughput. A typical desktop system does not
+ need it, even if it does IPsec.
+
+ If unsure, say N.
+
config INET_IPCOMP
tristate "IP: IPComp transformation"
select INET_XFRM_TUNNEL
@@ -430,6 +444,14 @@ config INET_UDP_DIAG
Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
+config INET_RAW_DIAG
+ tristate "RAW: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for RAW socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
config INET_DIAG_DESTROY
bool "INET: allow privileged process to administratively close sockets"
depends on INET_DIAG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bc6a6c8b9bcd..c6d4238ff94a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
@@ -40,6 +41,7 @@ obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 215143246e4b..6b1fc6e4278e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -90,7 +90,7 @@
#include <linux/random.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/igmp.h>
@@ -374,8 +374,18 @@ lookup_protocol:
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err)
+ if (err) {
sk_common_release(sk);
+ goto out;
+ }
+ }
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
}
out:
return err;
@@ -469,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK &&
+ if (snum && snum < inet_prot_sock(net) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
@@ -560,19 +570,30 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
* TCP 'magic' in here.
*/
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+ int addr_len, int flags, int is_sendmsg)
{
struct sock *sk = sock->sk;
int err;
long timeo;
- if (addr_len < sizeof(uaddr->sa_family))
- return -EINVAL;
+ /*
+ * uaddr can be NULL and addr_len can be 0 if:
+ * sk is a TCP fastopen active socket and
+ * TCP_FASTOPEN_CONNECT sockopt is set and
+ * we already have a valid cookie for this socket.
+ * In this case, user can call write() after connect().
+ * write() will invoke tcp_sendmsg_fastopen() which calls
+ * __inet_stream_connect().
+ */
+ if (uaddr) {
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
- if (uaddr->sa_family == AF_UNSPEC) {
- err = sk->sk_prot->disconnect(sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto out;
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
}
switch (sock->state) {
@@ -583,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- err = -EALREADY;
+ if (inet_sk(sk)->defer_connect)
+ err = is_sendmsg ? -EINPROGRESS : -EISCONN;
+ else
+ err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
@@ -597,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTING;
+ if (!err && inet_sk(sk)->defer_connect)
+ goto out;
+
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
@@ -652,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
lock_sock(sock->sk);
- err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
release_sock(sock->sk);
return err;
}
@@ -662,11 +689,12 @@ EXPORT_SYMBOL(inet_stream_connect);
* Accept a pending connection. The TCP layer now gives BSD semantics.
*/
-int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+int inet_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
struct sock *sk1 = sock->sk;
int err = -EINVAL;
- struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
if (!sk2)
goto do_err;
@@ -1396,7 +1424,7 @@ out_unlock:
rcu_read_unlock();
out:
- NAPI_GRO_CB(skb)->flush |= flush;
+ skb_gro_flush_final(skb, pp, flush);
return pp;
}
@@ -1460,8 +1488,10 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
int proto = iph->protocol;
int err = -ENOSYS;
- if (skb->encapsulation)
+ if (skb->encapsulation) {
+ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
skb_set_inner_network_header(skb, nhoff);
+ }
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
@@ -1690,6 +1720,9 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
+#ifdef CONFIG_SYSCTL
+ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
+#endif
return 0;
}
@@ -1821,8 +1854,6 @@ static int __init inet_init(void)
ip_init();
- tcp_v4_init();
-
/* Setup TCP slab cache for open requests. */
tcp_init();
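
The defer_connect hunks above are driven from userspace through the new TCP_FASTOPEN_CONNECT socket option. A minimal sketch of the intended client flow, assuming a kernel that carries this series (error handling elided; the fastopen_client() name is illustrative):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30	/* uapi value introduced with this series */
#endif

static int fastopen_client(const struct sockaddr_in *dst,
			   const void *buf, size_t len)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	/* Defer the actual connect until the first write. */
	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one));

	/* Returns 0 immediately; no SYN has been sent yet. */
	connect(fd, (const struct sockaddr *)dst, sizeof(*dst));

	/* The first write invokes tcp_sendmsg_fastopen(), which calls
	 * __inet_stream_connect(sock, NULL, 0, flags, 1) as in the hunk
	 * above; with a cached TFO cookie the data rides on the SYN.
	 */
	write(fd, buf, len);
	return fd;
}
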
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f2a71025a770..22377c8ff14b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err)
int ihl = ip_hdrlen(skb);
int ah_hlen = (ah->hdrlen + 2) << 2;
+ if (err)
+ goto out;
+
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, ihl);
icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..51b27ae09fbd 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1263,7 +1263,7 @@ void __init arp_init(void)
/*
* ax25 -> ASCII conversion
*/
-static char *ax2asc2(ax25_address *a, char *buf)
+static void ax2asc2(ax25_address *a, char *buf)
{
char c, *s;
int n;
@@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf)
*s++ = n + '0';
*s++ = '\0';
- if (*buf == '\0' || *buf == '-')
- return "*";
-
- return buf;
+ if (*buf == '\0' || *buf == '-') {
+ buf[0] = '*';
+ buf[1] = '\0';
+ }
}
#endif /* CONFIG_AX25 */
@@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
}
#endif
sprintf(tbuf, "%pI4", n->primary_key);
- seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n",
tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
read_unlock(&n->lock);
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 72d6f056d863..ae206163c273 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1587,6 +1587,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
goto validate_return_locked;
}
+ if (opt_iter + 1 == opt_len) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
tag_len = tag[1];
if (tag_len > (opt_len - opt_iter)) {
err_offset = opt_iter + 1;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 062a67ca9a21..cebedd545e5e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -26,12 +26,13 @@
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -65,8 +66,6 @@
#include <net/net_namespace.h>
#include <net/addrconf.h>
-#include "fib_lookup.h"
-
static struct ipv4_devconf ipv4_devconf = {
.data = {
[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 20fb25e3027b..b1e24446e297 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
#include <net/protocol.h>
#include <net/udp.h>
+#include <linux/highmem.h>
+
struct esp_skb_cb {
struct xfrm_skb_cb xfrm;
void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+ struct crypto_aead *aead = x->data;
+ int extralen = 0;
+ u8 *iv;
+ struct aead_request *req;
+ struct scatterlist *sg;
+
+ if (x->props.flags & XFRM_STATE_ESN)
+ extralen += sizeof(*extra);
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+
+ /* Unref skb_frag_pages in the src scatterlist if necessary.
+ * Skip the first sg which comes from skb->data.
+ */
+ if (req->src != req->dst)
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+ put_page(sg_page(sg));
+}
+
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
+ void *tmp;
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
- kfree(ESP_SKB_CB(skb)->tmp);
+ tmp = ESP_SKB_CB(skb)->tmp;
+ esp_ssg_unref(x, tmp);
+ kfree(tmp);
xfrm_output_resume(skb, err);
}
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
sizeof(__be32));
}
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+ struct ip_esp_hdr *esph,
+ struct esp_output_extra *extra)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ }
+
+ esph->spi = x->id.spi;
+
+ return esph;
+}
+
static void esp_output_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -128,18 +182,36 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+ /* Fill padding... */
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
+ do {
+ int i;
+ for (i = 0; i < plen - 2; i++)
+ tail[i] = i + 1;
+ } while (0);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = proto;
+}
+
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
- int err;
struct esp_output_extra *extra;
+ int err = -ENOMEM;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct aead_request *req;
- struct scatterlist *sg;
+ struct scatterlist *sg, *dsg;
struct sk_buff *trailer;
+ struct page *page;
void *tmp;
u8 *iv;
u8 *tail;
+ u8 *vaddr;
int blksize;
int clen;
int alen;
@@ -149,7 +221,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int nfrags;
int assoclen;
int extralen;
+ int tailen;
__be64 seqno;
+ __u8 proto = *skb_mac_header(skb);
/* skb is pure payload to encrypt */
@@ -169,12 +243,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
plen = clen - skb->len - tfclen;
-
- err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
- if (err < 0)
- goto error;
- nfrags = err;
-
+ tailen = tfclen + plen + alen;
assoclen = sizeof(*esph);
extralen = 0;
@@ -183,35 +252,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
assoclen += sizeof(__be32);
}
- tmp = esp_alloc_tmp(aead, nfrags, extralen);
- if (!tmp) {
- err = -ENOMEM;
- goto error;
- }
-
- extra = esp_tmp_extra(tmp);
- iv = esp_tmp_iv(aead, tmp, extralen);
- req = esp_tmp_req(aead, iv);
- sg = esp_req_sg(aead, req);
-
- /* Fill padding... */
- tail = skb_tail_pointer(trailer);
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = *skb_mac_header(skb);
- pskb_put(skb, trailer, clen - skb->len + alen);
-
- skb_push(skb, -skb_network_offset(skb));
- esph = ip_esp_hdr(skb);
*skb_mac_header(skb) = IPPROTO_ESP;
+ esph = ip_esp_hdr(skb);
/* this is non-NULL only with UDP Encapsulation */
if (x->encap) {
@@ -230,7 +272,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
uh = (struct udphdr *)esph;
uh->source = sport;
uh->dest = dport;
- uh->len = htons(skb->len - skb_transport_offset(skb));
+ uh->len = htons(skb->len + tailen
+ - skb_transport_offset(skb));
uh->check = 0;
switch (encap_type) {
@@ -248,31 +291,148 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
*skb_mac_header(skb) = IPPROTO_UDP;
}
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ if (!skb_cloned(skb)) {
+ if (tailen <= skb_availroom(skb)) {
+ nfrags = 1;
+ trailer = skb;
+ tail = skb_tail_pointer(trailer);
- aead_request_set_callback(req, 0, esp_output_done, skb);
+ goto skip_cow;
+ } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+ && !skb_has_frag_list(skb)) {
+ int allocsize;
+ struct sock *sk = skb->sk;
+ struct page_frag *pfrag = &x->xfrag;
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * encryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- extra->esphoff = (unsigned char *)esph -
- skb_transport_header(skb);
- esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
- extra->seqhi = esph->spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto cow;
+ }
+
+ page = pfrag->page;
+ get_page(page);
+
+ vaddr = kmap_atomic(page);
+
+ tail = vaddr + pfrag->offset;
+
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ kunmap_atomic(vaddr);
+
+ nfrags = skb_shinfo(skb)->nr_frags;
+
+ __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+ tailen);
+ skb_shinfo(skb)->nr_frags = ++nfrags;
+
+ pfrag->offset = pfrag->offset + allocsize;
+ nfrags++;
+
+ skb->len += tailen;
+ skb->data_len += tailen;
+ skb->truesize += tailen;
+ if (sk)
+ atomic_add(tailen, &sk->sk_wmem_alloc);
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esph->spi = x->id.spi;
+
+ tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
+ if (!tmp) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = &sg[nfrags];
+
+ esph = esp_output_set_extra(skb, esph, extra);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+
+ skb_shinfo(skb)->nr_frags = 1;
+
+ page = pfrag->page;
+ get_page(page);
+ /* replace page frags in skb with new page */
+ __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+ pfrag->offset = pfrag->offset + allocsize;
+
+ sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+ skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ spin_unlock_bh(&x->lock);
+
+ goto skip_cow2;
+ }
}
+cow:
+ err = skb_cow_data(skb, tailen, &trailer);
+ if (err < 0)
+ goto error;
+ nfrags = err;
+ tail = skb_tail_pointer(trailer);
+ esph = ip_esp_hdr(skb);
+
+skip_cow:
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ pskb_put(skb, trailer, clen - skb->len + alen);
+ skb_push(skb, -skb_network_offset(skb));
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
esph->spi = x->id.spi;
+ tmp = esp_alloc_tmp(aead, nfrags, extralen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = sg;
+
+ esph = esp_output_set_extra(skb, esph, extra);
+
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
(unsigned char *)esph - skb->data,
assoclen + ivlen + clen + alen);
- aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
aead_request_set_ad(req, assoclen);
seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +458,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
esp_output_restore_header(skb);
}
+ if (sg != dsg)
+ esp_ssg_unref(x, tmp);
kfree(tmp);
error:
@@ -401,6 +563,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
__skb_pull(skb, 4);
}
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * decryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ }
+}
+
static void esp_input_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -437,12 +616,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
if (elen <= 0)
goto out;
- err = skb_cow_data(skb, 0, &trailer);
- if (err < 0)
- goto out;
-
- nfrags = err;
-
assoclen = sizeof(*esph);
seqhilen = 0;
@@ -451,6 +624,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
assoclen += seqhilen;
}
+ if (!skb_cloned(skb)) {
+ if (!skb_is_nonlinear(skb)) {
+ nfrags = 1;
+
+ goto skip_cow;
+ } else if (!skb_has_frag_list(skb)) {
+ nfrags = skb_shinfo(skb)->nr_frags;
+ nfrags++;
+
+ goto skip_cow;
+ }
+ }
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+
+ nfrags = err;
+
+skip_cow:
err = -ENOMEM;
tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
@@ -462,26 +655,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
- skb->ip_summed = CHECKSUM_NONE;
+ esp_input_set_header(skb, seqhi);
- esph = (struct ip_esp_hdr *)skb->data;
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
- aead_request_set_callback(req, 0, esp_input_done, skb);
+ skb->ip_summed = CHECKSUM_NONE;
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * decryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)skb_push(skb, 4);
- *seqhi = esph->spi;
- esph->spi = esph->seq_no;
- esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ if ((x->props.flags & XFRM_STATE_ESN))
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
- }
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ else
+ aead_request_set_callback(req, 0, esp_input_done, skb);
aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
aead_request_set_ad(req, assoclen);
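
As a worked example of the trailer sizing that esp_output_fill_trailer() consumes, this standalone sketch mirrors the clen/plen/tailen arithmetic from esp_output() above (the input values are illustrative):

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int len = 1419;	/* example: payload bytes to encrypt */
	unsigned int tfclen = 0;	/* no TFC padding requested */
	unsigned int blksize = 16;	/* cipher blocksize, ALIGNed to >= 4 */
	unsigned int alen = 12;		/* ICV length, e.g. a 96-bit tag */

	/* Pad payload + pad-length byte + next-header byte to blksize. */
	unsigned int clen = ALIGN(len + 2 + tfclen, blksize);
	unsigned int plen = clen - len - tfclen;
	unsigned int tailen = tfclen + plen + alen;

	/* esp_output_fill_trailer() then writes 1, 2, ..., plen - 2 as
	 * padding, tail[plen - 2] = plen - 2 and tail[plen - 1] = proto.
	 */
	printf("clen=%u plen=%u tailen=%u\n", clen, plen, tailen);
	return 0;	/* prints: clen=1424 plen=5 tailen=17 */
}
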
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 000000000000..1de442632406
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,106 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/udp.h>
+
+static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ int offset = skb_gro_offset(skb);
+ struct xfrm_offload *xo;
+ struct xfrm_state *x;
+ __be32 seq;
+ __be32 spi;
+ int err;
+
+ skb_pull(skb, offset);
+
+ if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+ goto out;
+
+ err = secpath_set(skb);
+ if (err)
+ goto out;
+
+ if (skb->sp->len == XFRM_MAX_DEPTH)
+ goto out;
+
+ x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ip_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ goto out;
+
+ skb->sp->xvec[skb->sp->len++] = x;
+ skb->sp->olen++;
+
+ xo = xfrm_offload(skb);
+ if (!xo) {
+ xfrm_state_put(x);
+ goto out;
+ }
+ xo->flags |= XFRM_GRO;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+ XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+ /* We don't need to handle errors from xfrm_input, it does all
+ * the error handling and frees the resources on error. */
+ xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+ return ERR_PTR(-EINPROGRESS);
+out:
+ skb_push(skb, offset);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 1;
+
+ return NULL;
+}
+
+static const struct net_offload esp4_offload = {
+ .callbacks = {
+ .gro_receive = esp4_gro_receive,
+ },
+};
+
+static int __init esp4_offload_init(void)
+{
+ return inet_add_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+static void __exit esp4_offload_exit(void)
+{
+ inet_del_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+module_init(esp4_offload_init);
+module_exit(esp4_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 161fc0f0d752..8f2133ffc2ff 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -14,7 +14,7 @@
*/
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
@@ -46,6 +46,7 @@
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <net/l3mdev.h>
+#include <net/lwtunnel.h>
#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -85,7 +86,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- if (id == RT_TABLE_LOCAL)
+ if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
alias = fib_new_table(net, RT_TABLE_MAIN);
tb = fib_trie_table(id, alias);
@@ -318,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
int ret, no_addr;
struct fib_result res;
struct flowi4 fl4;
- struct net *net;
+ struct net *net = dev_net(dev);
bool dev_match;
fl4.flowi4_oif = 0;
@@ -331,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
no_addr = idev->ifa_list == NULL;
@@ -338,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
trace_fib_validate_source(dev, &fl4);
- net = dev_net(dev);
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
goto e_inval;
- if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+ if (!rpf && !fib_num_tclassid_users(net) &&
(dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
goto last_resort;
fib_combine_itag(itag, &res);
@@ -620,6 +621,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_UID] = { .type = NLA_U32 },
+ [RTA_MARK] = { .type = NLA_U32 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -676,6 +679,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_mx_len = nla_len(attr);
break;
case RTA_MULTIPATH:
+ err = lwtunnel_valid_encap_type_attr(nla_data(attr),
+ nla_len(attr));
+ if (err < 0)
+ goto errout;
cfg->fc_mp = nla_data(attr);
cfg->fc_mp_len = nla_len(attr);
break;
@@ -690,6 +697,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
break;
case RTA_ENCAP_TYPE:
cfg->fc_encap_type = nla_get_u16(attr);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+ if (err < 0)
+ goto errout;
break;
}
}
@@ -1073,7 +1083,8 @@ static void nl_fib_input(struct sk_buff *skb)
net = sock_net(skb->sk);
nlh = nlmsg_hdr(skb);
- if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
+ skb->len < nlh->nlmsg_len ||
nlmsg_len(nlh) < sizeof(*frn))
return;
@@ -1218,6 +1229,8 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+ net->ipv4.fib_seq = 0;
+
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 388d3e21629b..317026a39cfa 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -13,7 +13,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
+EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
@@ -470,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
int ret;
change_nexthops(fi) {
@@ -502,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
nla = nla_find(attrs, attrlen, RTA_ENCAP);
if (nla) {
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
struct nlattr *nla_entype;
nla_entype = nla_find(attrs, attrlen,
RTA_ENCAP_TYPE);
if (!nla_entype)
goto err_inval;
- if (cfg->fc_oif)
- dev = __dev_get_by_index(net, cfg->fc_oif);
- ret = lwtunnel_build_state(dev, nla_get_u16(
+
+ ret = lwtunnel_build_state(nla_get_u16(
nla_entype),
nla, AF_INET, cfg,
&lwtstate);
@@ -596,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi,
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int fib_encap_match(struct net *net, u16 encap_type,
+static int fib_encap_match(u16 encap_type,
struct nlattr *encap,
- int oif, const struct fib_nh *nh,
+ const struct fib_nh *nh,
const struct fib_config *cfg)
{
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
int ret, result = 0;
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- if (oif)
- dev = __dev_get_by_index(net, oif);
- ret = lwtunnel_build_state(dev, encap_type, encap,
+ ret = lwtunnel_build_state(encap_type, encap,
AF_INET, cfg, &lwtstate);
if (!ret) {
result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -622,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type,
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
int remaining;
@@ -633,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
if (cfg->fc_oif || cfg->fc_gw) {
if (cfg->fc_encap) {
- if (fib_encap_match(net, cfg->fc_encap_type,
- cfg->fc_encap, cfg->fc_oif,
- fi->fib_nh, cfg))
+ if (fib_encap_match(cfg->fc_encap_type,
+ cfg->fc_encap, fi->fib_nh, cfg))
return 1;
}
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -1092,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_encap) {
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
goto err_inval;
- if (cfg->fc_oif)
- dev = __dev_get_by_index(net, cfg->fc_oif);
- err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ err = lwtunnel_build_state(cfg->fc_encap_type,
cfg->fc_encap, AF_INET, cfg,
&lwtstate);
if (err)
@@ -1278,8 +1268,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
goto nla_put_failure;
#endif
- if (fi->fib_nh->nh_lwtstate)
- lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
+ if (fi->fib_nh->nh_lwtstate &&
+ lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
+ goto nla_put_failure;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
@@ -1315,8 +1306,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
#endif
- if (nh->nh_lwtstate)
- lwtunnel_fill_encap(skb, nh->nh_lwtstate);
+ if (nh->nh_lwtstate &&
+ lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
+ goto nla_put_failure;
+
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
} endfor_nexthops(fi);
@@ -1362,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
return ret;
}
+static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
+ enum fib_event_type event_type)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
+ struct fib_nh_notifier_info info = {
+ .fib_nh = fib_nh,
+ };
+
+ switch (event_type) {
+ case FIB_EVENT_NH_ADD:
+ if (fib_nh->nh_flags & RTNH_F_DEAD)
+ break;
+ if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ fib_nh->nh_flags & RTNH_F_LINKDOWN)
+ break;
+ return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
+ &info.info);
+ case FIB_EVENT_NH_DEL:
+ if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
+ (fib_nh->nh_flags & RTNH_F_DEAD))
+ return call_fib_notifiers(dev_net(fib_nh->nh_dev),
+ event_type, &info.info);
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
/* Event force Flags Description
* NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
@@ -1403,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
break;
}
+ call_fib_nh_notifiers(nexthop_nh,
+ FIB_EVENT_NH_DEL);
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1433,7 +1458,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
}
/* Must be invoked inside of an RCU protected region. */
-void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
+static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
struct fib_info *fi = NULL, *last_resort = NULL;
struct hlist_head *fa_head = res->fa_head;
@@ -1557,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
continue;
alive++;
nexthop_nh->nh_flags &= ~nh_flags;
+ call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
} endfor_nexthops(fi)
if (alive > 0) {
@@ -1617,8 +1643,13 @@ void fib_select_multipath(struct fib_result *res, int hash)
void fib_select_path(struct net *net, struct fib_result *res,
struct flowi4 *fl4, int mp_hash)
{
+ bool oif_check;
+
+ oif_check = (fl4->flowi4_oif == 0 ||
+ fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (res->fi->fib_nhs > 1 && oif_check) {
if (mp_hash < 0)
mp_hash = get_hash_from_flowi4(fl4) >> 1;
@@ -1628,7 +1659,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
#endif
if (!res->prefixlen &&
res->table->tb_num_default > 1 &&
- res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ res->type == RTN_UNICAST && oif_check)
fib_select_default(fl4, res);
if (!fl4->saddr)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e3665bf7a7f3..2f0d8233950f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -50,7 +50,7 @@
#define VERSION "0.409"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -84,31 +84,119 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static BLOCKING_NOTIFIER_HEAD(fib_chain);
+static unsigned int fib_seq_sum(void)
+{
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ fib_seq += net->ipv4.fib_seq;
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_notifier_info info;
+
+ if (net->ipv4.fib_has_custom_rules)
+ call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ };
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
-int register_fib_notifier(struct notifier_block *nb)
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
{
- return blocking_notifier_chain_register(&fib_chain, nb);
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ /* Mutex semantics guarantee that every change done to
+ * FIB tries before we read the change sequence counter
+ * is now visible to us.
+ */
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+ fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
}
EXPORT_SYMBOL(register_fib_notifier);
int unregister_fib_notifier(struct notifier_block *nb)
{
- return blocking_notifier_chain_unregister(&fib_chain, nb);
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ net->ipv4.fib_seq++;
info->net = net;
- return blocking_notifier_call_chain(&fib_chain, event_type, info);
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
}
static int call_fib_entry_notifiers(struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id, u32 nlflags)
+ u8 tos, u8 type, u32 tb_id)
{
struct fib_entry_notifier_info info = {
.dst = dst,
@@ -117,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net,
.tos = tos,
.type = type,
.tb_id = tb_id,
- .nlflags = nlflags,
};
return call_fib_notifiers(net, event_type, &info.info);
}
@@ -1109,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg)
{
+ enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
@@ -1206,6 +1294,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
+
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1214,13 +1309,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
- key, plen, fi,
- new_fa->fa_tos, cfg->fc_type,
- tb->tb_id, cfg->fc_nlflags);
- rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, nlflags);
-
goto succeeded;
}
/* Error if we find a perfect match which
@@ -1230,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (fa_match)
goto out;
- if (cfg->fc_nlflags & NLM_F_APPEND)
+ if (cfg->fc_nlflags & NLM_F_APPEND) {
+ event = FIB_EVENT_ENTRY_APPEND;
nlflags |= NLM_F_APPEND;
- else
+ } else {
fa = fa_first;
+ }
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1262,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
- cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
+ call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
+ tb->tb_id);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
@@ -1564,8 +1654,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
return -ESRCH;
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
- fa_to_delete->fa_info, tos, cfg->fc_type,
- tb->tb_id, 0);
+ fa_to_delete->fa_info, tos,
+ fa_to_delete->fa_type, tb->tb_id);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1874,7 +1964,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
+ tb->tb_id != fa->tb_id) {
slen = fa->fa_slen;
continue;
}
@@ -1883,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
n->key,
KEYLENGTH - fa->fa_slen,
fi, fa->fa_tos, fa->fa_type,
- tb->tb_id, 0);
+ tb->tb_id);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1903,6 +1994,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
return found;
}
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+ struct fib_table *tb, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi)
+ continue;
+
+ /* local and main table can share the same trie,
+ * so don't notify twice for the same entry.
+ */
+ if (tb->tb_id != fa->tb_id)
+ continue;
+
+ call_fib_entry_notifier(nb, net, event_type, l->key,
+ KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+ fa->fa_type, fa->tb_id);
+ }
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+ struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ t_key key = 0;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ fib_leaf_notify(net, l, tb, nb, event_type);
+
+ key = l->key + 1;
+ /* stop in case of wrap around */
+ if (key < l->key)
+ break;
+ }
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ fib_table_notify(net, tb, nb, event_type);
+ }
+}
+
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);
@@ -2241,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"Basic info: size of leaf:"
- " %Zd bytes, size of tnode: %Zd bytes.\n",
+ " %zd bytes, size of tnode: %zd bytes.\n",
LEAF_SIZE, TNODE_SIZE(0));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
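
For listeners, the reworked registration above replays existing rules and entries and retries until the fib_seq sum is stable. A hypothetical consumer sketched against the signatures in this patch (the demo_* names are illustrative, not part of the series):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <net/ip_fib.h>

static int demo_fib_event(struct notifier_block *nb,
			  unsigned long event, void *ptr)
{
	struct fib_notifier_info *info = ptr;
	struct fib_entry_notifier_info *fen_info;

	switch (event) {
	case FIB_EVENT_ENTRY_ADD:
	case FIB_EVENT_ENTRY_DEL:
		fen_info = container_of(info,
					struct fib_entry_notifier_info, info);
		/* Offload/unoffload fen_info->dst / dst_len / fi here.
		 * Runs in atomic context now that the chain is atomic.
		 */
		break;
	default:
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_fib_nb = {
	.notifier_call = demo_fib_event,
};

static void demo_fib_dump_abort(struct notifier_block *nb)
{
	/* Undo partial state from a dump that raced with FIB changes. */
}

static int __init demo_init(void)
{
	/* Replays RULE_ADD/ENTRY_ADD for existing state; fails with
	 * -EBUSY after FIB_DUMP_MAX_RETRIES inconsistent dumps.
	 */
	return register_fib_notifier(&demo_fib_nb, demo_fib_dump_abort);
}
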
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 030d1531e897..805f6607f8d9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -622,14 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
return err;
}
-static struct genl_family fou_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = FOU_GENL_NAME,
- .version = FOU_GENL_VERSION,
- .maxattr = FOU_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family fou_nl_family;
static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
[FOU_ATTR_PORT] = { .type = NLA_U16, },
@@ -831,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = {
},
};
+static struct genl_family fou_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = FOU_GENL_NAME,
+ .version = FOU_GENL_VERSION,
+ .maxattr = FOU_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = fou_nl_ops,
+ .n_ops = ARRAY_SIZE(fou_nl_ops),
+};
+
size_t fou_encap_hlen(struct ip_tunnel_encap *e)
{
return sizeof(struct udphdr);
@@ -1086,8 +1090,7 @@ static int __init fou_init(void)
if (ret)
goto exit;
- ret = genl_register_family_with_ops(&fou_nl_family,
- fou_nl_ops);
+ ret = genl_register_family(&fou_nl_family);
if (ret < 0)
goto unregister;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 48734ee6293f..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -91,7 +91,7 @@
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
return *this_cpu_ptr(net->ipv4.icmp_sk);
}
+/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
struct sock *sk;
- local_bh_disable();
-
sk = icmp_sk(net);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
* dst_link_failure() for an outgoing ICMP packet.
*/
- local_bh_enable();
return NULL;
}
return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
static inline void icmp_xmit_unlock(struct sock *sk)
{
- spin_unlock_bh(&sk->sk_lock.slock);
+ spin_unlock(&sk->sk_lock.slock);
}
int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
}
EXPORT_SYMBOL(icmp_global_allow);
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+ if (type > NR_ICMP_TYPES)
+ return true;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ return true;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ return true;
+
+ return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+ if (icmpv4_mask_allow(net, type, code))
+ return true;
+
+ if (icmp_global_allow())
+ return true;
+
+ return false;
+}
+
/*
* Send an ICMP frame.
*/
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
struct flowi4 *fl4, int type, int code)
{
struct dst_entry *dst = &rt->dst;
+ struct inet_peer *peer;
bool rc = true;
+ int vif;
- if (type > NR_ICMP_TYPES)
- goto out;
-
- /* Don't limit PMTU discovery. */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ if (icmpv4_mask_allow(net, type, code))
goto out;
/* No rate limit on loopback */
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
goto out;
- /* Limit if icmp type is enabled in ratemask. */
- if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
- goto out;
-
- rc = false;
- if (icmp_global_allow()) {
- int vif = l3mdev_master_ifindex(dst->dev);
- struct inet_peer *peer;
-
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
- rc = inet_peer_xrlim_allow(peer,
- net->ipv4.sysctl_icmp_ratelimit);
- if (peer)
- inet_putpeer(peer);
- }
+ vif = l3mdev_master_ifindex(dst->dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+ rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+ if (peer)
+ inet_putpeer(peer);
out:
return rc;
}
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct inet_sock *inet;
__be32 daddr, saddr;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
+ int type = icmp_param->data.icmph.type;
+ int code = icmp_param->data.icmph.code;
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
+ /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* global icmp_msgs_per_sec */
+ if (!icmpv4_global_allow(net, type, code))
+ goto out_bh_enable;
+
sk = icmp_xmit_lock(net);
if (!sk)
- return;
+ goto out_bh_enable;
inet = inet_sk(sk);
icmp_param->data.icmph.checksum = 0;
@@ -425,6 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
@@ -432,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
goto out_unlock;
- if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
- icmp_param->data.icmph.code))
+ if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -473,6 +497,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
+ fl4->flowi4_uid = sock_net_uid(net, NULL);
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
@@ -569,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct iphdr *iph;
int room;
- struct icmp_bxm *icmp_param;
+ struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc;
struct flowi4 fl4;
@@ -646,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
- icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
- if (!icmp_param)
- return;
+ /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit */
+ if (!icmpv4_global_allow(net, type, code))
+ goto out_bh_enable;
sk = icmp_xmit_lock(net);
if (!sk)
- goto out_free;
+ goto out_bh_enable;
/*
* Construct source address and options.
@@ -679,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+ if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -687,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Prepare data for ICMP header.
*/
- icmp_param->data.icmph.type = type;
- icmp_param->data.icmph.code = code;
- icmp_param->data.icmph.un.gateway = info;
- icmp_param->data.icmph.checksum = 0;
- icmp_param->skb = skb_in;
- icmp_param->offset = skb_network_offset(skb_in);
+ icmp_param.data.icmph.type = type;
+ icmp_param.data.icmph.code = code;
+ icmp_param.data.icmph.un.gateway = info;
+ icmp_param.data.icmph.checksum = 0;
+ icmp_param.skb = skb_in;
+ icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
sk->sk_mark = mark;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param->replyopts.opt;
+ ipc.opt = &icmp_param.replyopts.opt;
ipc.tx_flags = 0;
ipc.ttl = 0;
ipc.tos = -1;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
- type, code, icmp_param);
+ type, code, &icmp_param);
if (IS_ERR(rt))
goto out_unlock;
+ /* peer icmp_ratelimit */
if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
goto ende;
@@ -714,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
- icmp_param->data_len = skb_in->len - icmp_param->offset;
- if (icmp_param->data_len > room)
- icmp_param->data_len = room;
- icmp_param->head_len = sizeof(struct icmphdr);
+ icmp_param.data_len = skb_in->len - icmp_param.offset;
+ if (icmp_param.data_len > room)
+ icmp_param.data_len = room;
+ icmp_param.head_len = sizeof(struct icmphdr);
- icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
-out_free:
- kfree(icmp_param);
+out_bh_enable:
+ local_bh_enable();
out:;
}
EXPORT_SYMBOL(icmp_send);
@@ -1045,12 +1074,12 @@ int icmp_rcv(struct sk_buff *skb)
if (success) {
consume_skb(skb);
- return 0;
+ return NET_RX_SUCCESS;
}
drop:
kfree_skb(skb);
- return 0;
+ return NET_RX_DROP;
csum_error:
__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
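
The global limiter these hunks now consult first is a token bucket. A userspace re-sketch of the icmp_global_allow() logic, under the assumption that the burst default is 50 and with the refill-granularity details elided:

#include <stdbool.h>
#include <stdio.h>

#define HZ 1000
static unsigned int msgs_per_sec = 1000;	/* sysctl_icmp_msgs_per_sec */
static unsigned int msgs_burst = 50;		/* assumed burst default */

static unsigned long stamp;	/* last refill time, in jiffies */
static unsigned int credit;	/* tokens currently available */

/* Checked by icmpv4_global_allow() before any socket or route work,
 * which is why both callers now run it under local_bh_disable().
 */
static bool global_allow(unsigned long now)
{
	unsigned long delta = now - stamp;

	if (delta > HZ)
		delta = HZ;			/* refill at most 1s worth */
	if (delta) {
		credit += msgs_per_sec * delta / HZ;
		if (credit > msgs_burst)
			credit = msgs_burst;	/* cap at the burst size */
		stamp = now;
	}
	if (credit) {
		credit--;			/* spend one token */
		return true;
	}
	return false;
}

int main(void)
{
	int i, sent = 0;

	credit = msgs_burst;
	for (i = 0; i < 100; i++)	/* 100 replies within one jiffy */
		sent += global_allow(0);
	printf("allowed %d of 100 in a burst\n", sent);	/* -> 50 */
	return 0;
}

Replies that survive this global check are still subject to the per-destination inet_peer limit in icmpv4_xrlim_allow(), as before.
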
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 15db786d50ed..44fd86de2823 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -72,7 +72,7 @@
#include <linux/module.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -219,9 +219,14 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
int tv = prandom_u32() % in_dev->mr_maxdelay;
+ unsigned long exp = jiffies + tv + 2;
+
+ if (in_dev->mr_gq_running &&
+ time_after_eq(exp, (in_dev->mr_gq_timer).expires))
+ return;
in_dev->mr_gq_running = 1;
- if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ if (!mod_timer(&in_dev->mr_gq_timer, exp))
in_dev_hold(in_dev);
}
@@ -1167,6 +1172,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
psf->sf_crcount = im->crcount;
}
in_dev_put(pmc->interface);
+ kfree(pmc);
}
spin_unlock_bh(&im->lock);
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 61a9deec2993..5e313c1ac94f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
+#if IS_ENABLED(CONFIG_IPV6)
+/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ * only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ * and 0.0.0.0 equals to 0.0.0.0 only
+ */
+static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+ const struct in6_addr *sk2_rcv_saddr6,
+ __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk1_ipv6only, bool sk2_ipv6only,
+ bool match_wildcard)
+{
+ int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
+ int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+ /* if both are mapped, treat as IPv4 */
+ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return 1;
+ if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
+ }
+
+ if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+ return 1;
+
+ if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+ !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+ !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (sk2_rcv_saddr6 &&
+ ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
+ return 1;
+
+ return 0;
+}
+#endif
+
+/* match_wildcard == true: 0.0.0.0 matches any IPv4 address
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * 0.0.0.0 only matches 0.0.0.0
+ */
+static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk2_ipv6only, bool match_wildcard)
+{
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return 1;
+ if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
+}
+
+int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+ bool match_wildcard)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
+ inet6_rcv_saddr(sk2),
+ sk->sk_rcv_saddr,
+ sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk),
+ ipv6_only_sock(sk2),
+ match_wildcard);
+#endif
+ return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk2), match_wildcard);
+}
+EXPORT_SYMBOL(inet_rcv_saddr_equal);
+
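A few concrete outcomes may make the match_wildcard split easier to check against the helpers above; this is an illustrative exercise, not kernel code:

/* assuming the ipv4_rcv_saddr_equal() above, with sk2 not IPv6-only */
static void rcv_saddr_examples(void)
{
        __be32 a = htonl(0x0a000001);   /* 10.0.0.1 */
        __be32 b = htonl(0x0a000002);   /* 10.0.0.2 */
        __be32 any = 0;                 /* 0.0.0.0  */

        WARN_ON(ipv4_rcv_saddr_equal(a, a, false, false) != 1);   /* exact match */
        WARN_ON(ipv4_rcv_saddr_equal(a, any, false, true) != 1);  /* wildcard matches */
        WARN_ON(ipv4_rcv_saddr_equal(a, any, false, false) != 0); /* exact required */
        WARN_ON(ipv4_rcv_saddr_equal(a, b, false, true) != 0);    /* different, no match */
}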
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
@@ -44,12 +124,13 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
-int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb, bool relax)
+static int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb,
+ bool relax, bool reuseport_ok)
{
struct sock *sk2;
- int reuse = sk->sk_reuse;
- int reuseport = sk->sk_reuseport;
+ bool reuse = sk->sk_reuse;
+ bool reuseport = !!sk->sk_reuseport && reuseport_ok;
kuid_t uid = sock_i_uid((struct sock *)sk);
/*
@@ -61,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
- !inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -71,53 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk,
rcu_access_pointer(sk->sk_reuseport_cb) ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
-
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
-
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
}
}
return sk2 != NULL;
}
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket. Returns with the
+ * inet_bind_hashbucket lock held.
*/
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
{
- bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int ret = 1, attempts = 5, port = snum;
- int smallest_size = -1, smallest_port;
+ int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
- kuid_t uid = sock_i_uid(sk);
u32 remaining, offset;
- if (port) {
-have_port:
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock_bh(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
- goto tb_found;
-
- goto tb_not_found;
- }
-again:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -141,8 +202,6 @@ other_half_scan:
* We do the opposite to not pollute connect() users.
*/
offset |= 1U;
- smallest_size = -1;
- smallest_port = low; /* avoid compiler warning */
other_parity_scan:
port = low + offset;
@@ -156,29 +215,17 @@ other_parity_scan:
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
- if (((tb->fastreuse > 0 && reuse) ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(tb->fastuid, uid))) &&
- (tb->num_owners < smallest_size || smallest_size == -1)) {
- smallest_size = tb->num_owners;
- smallest_port = port;
- }
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
- goto tb_found;
+ if (!inet_csk_bind_conflict(sk, tb, false, false))
+ goto success;
goto next_port;
}
- goto tb_not_found;
+ tb = NULL;
+ goto success;
next_port:
spin_unlock_bh(&head->lock);
cond_resched();
}
- if (smallest_size != -1) {
- port = smallest_port;
- goto have_port;
- }
offset--;
if (!(offset & 1))
goto other_parity_scan;
@@ -188,8 +235,74 @@ next_port:
attempt_half = 2;
goto other_half_scan;
}
- return ret;
+ return NULL;
+success:
+ *port_ret = port;
+ *tb_ret = tb;
+ return head;
+}
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+ struct sock *sk)
+{
+ kuid_t uid = sock_i_uid(sk);
+
+ if (tb->fastreuseport <= 0)
+ return 0;
+ if (!sk->sk_reuseport)
+ return 0;
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ return 0;
+ if (!uid_eq(tb->fastuid, uid))
+ return 0;
+ /* We only need to check the rcv_saddr if this tb was once marked
+ * without fastreuseport and then was reset, as we can only know that
+ * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
+ * owners list.
+ */
+ if (tb->fastreuseport == FASTREUSEPORT_ANY)
+ return 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb->fast_sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
+ &sk->sk_v6_rcv_saddr,
+ tb->fast_rcv_saddr,
+ sk->sk_rcv_saddr,
+ tb->fast_ipv6_only,
+ ipv6_only_sock(sk), true);
+#endif
+ return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
+ ipv6_only_sock(sk), true);
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ int ret = 1, port = snum;
+ struct inet_bind_hashbucket *head;
+ struct net *net = sock_net(sk);
+ struct inet_bind_bucket *tb = NULL;
+ kuid_t uid = sock_i_uid(sk);
+
+ if (!port) {
+ head = inet_csk_find_open_port(sk, &tb, &port);
+ if (!head)
+ return ret;
+ if (!tb)
+ goto tb_not_found;
+ goto success;
+ }
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port)
+ goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
@@ -200,38 +313,54 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (((tb->fastreuse > 0 && reuse) ||
- (tb->fastreuseport > 0 &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
- smallest_size == -1)
+ if ((tb->fastreuse > 0 && reuse) ||
+ sk_reuseport_match(tb, sk))
goto success;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if ((reuse ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(tb->fastuid, uid))) &&
- smallest_size != -1 && --attempts >= 0) {
- spin_unlock_bh(&head->lock);
- goto again;
- }
+ if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
+ }
+success:
+ if (hlist_empty(&tb->owners)) {
+ tb->fastreuse = reuse;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = FASTREUSEPORT_ANY;
+ tb->fastuid = uid;
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ } else {
+ tb->fastreuseport = 0;
}
+ } else {
if (!reuse)
tb->fastreuse = 0;
- if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
- tb->fastreuseport = 0;
- } else {
- tb->fastreuse = reuse;
if (sk->sk_reuseport) {
- tb->fastreuseport = 1;
- tb->fastuid = uid;
+ /* We didn't match or we don't have fastreuseport set on
+ * the tb, but we have sk_reuseport set on this socket
+ * and we know that there are no bind conflicts with
+ * this socket in this tb, so reset our tb's reuseport
+ * settings so that any subsequent sockets that match
+ * our current socket will be put on the fast path.
+ *
+ * If we reset we need to set FASTREUSEPORT_STRICT so we
+ * do extra checking for all subsequent sk_reuseport
+ * socks.
+ */
+ if (!sk_reuseport_match(tb, sk)) {
+ tb->fastreuseport = FASTREUSEPORT_STRICT;
+ tb->fastuid = uid;
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ }
} else {
tb->fastreuseport = 0;
}
}
-success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
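The reset logic is easiest to see as a sequence of binds to one port (addresses illustrative, same uid throughout):

/*
 * A: 10.0.0.1, no SO_REUSEPORT -> empty bucket, tb->fastreuseport = 0
 * B: 10.0.0.2, SO_REUSEPORT    -> no cached match; the owner walk finds
 *                                 no conflict, so tb is reset to
 *                                 FASTREUSEPORT_STRICT caching 10.0.0.2
 * C: 10.0.0.2, SO_REUSEPORT    -> STRICT: the cached saddr matches, so C
 *                                 takes the fast path without walking the
 *                                 owners list again
 */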
@@ -295,7 +424,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
/*
* This will accept the next outstanding connection.
*/
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
@@ -415,7 +544,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -452,7 +581,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -707,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
- local_bh_disable();
percpu_counter_dec(sk->sk_prot->orphan_count);
- local_bh_enable();
+
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e4d16fc5bbb3..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -200,13 +200,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
goto errout;
+ /*
+ * RAW sockets might have user-defined protocols assigned,
+ * so report the one supplied on socket creation.
+ */
+ if (sk->sk_type == SOCK_RAW) {
+ if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+ goto errout;
+ }
+
if (!icsk) {
handler->idiag_get_info(sk, r, NULL);
goto out;
}
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
@@ -852,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
- int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
- bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ int i, num, s_i, s_num;
+ struct sock *sk;
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
@@ -863,16 +873,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
- if (!(idiag_states & TCPF_LISTEN))
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
- struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
- spin_lock_bh(&ilb->lock);
+ spin_lock(&ilb->lock);
sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
@@ -892,26 +901,18 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (r->id.idiag_dport ||
- cb->args[3] > 0)
- goto next_listen;
-
if (inet_csk_diag_dump(sk, skb, cb, r,
bc, net_admin) < 0) {
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
goto done;
}
next_listen:
- cb->args[3] = 0;
- cb->args[4] = 0;
++num;
}
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
s_num = 0;
- cb->args[3] = 0;
- cb->args[4] = 0;
}
skip_listen_ht:
cb->args[0] = 1;
@@ -921,13 +922,14 @@ skip_listen_ht:
if (!(idiag_states & ~TCPF_LISTEN))
goto out;
+#define SKARR_SZ 16
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct hlist_nulls_node *node;
- struct sock *sk;
-
- num = 0;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
if (hlist_nulls_empty(&head->chain))
continue;
@@ -935,9 +937,12 @@ skip_listen_ht:
if (i > s_i)
s_num = 0;
+next_chunk:
+ num = 0;
+ accum = 0;
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- int state, res;
+ int state;
if (!net_eq(sock_net(sk), net))
continue;
@@ -961,21 +966,35 @@ skip_listen_ht:
if (!inet_diag_bc_sk(bc, sk))
goto next_normal;
- res = sk_diag_fill(sk, skb, r,
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->nlh, net_admin);
- if (res < 0) {
- spin_unlock_bh(lock);
- goto done;
+ if (res < 0)
+ num = num_arr[idx];
}
-next_normal:
- ++num;
+ sock_gen_put(sk_arr[idx]);
}
-
- spin_unlock_bh(lock);
+ if (res < 0)
+ break;
cond_resched();
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
}
done:
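The rewritten loop no longer calls sk_diag_fill() with the ehash bucket lock held; it batches up to SKARR_SZ held sockets per pass and fills them unlocked, which keeps spinlock hold times short. The pattern in isolation (wanted() and emit() are placeholders for the filtering and netlink-fill steps above):

struct sock *batch[SKARR_SZ];
int i, n = 0, err = 0;

spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
        if (!wanted(sk))
                continue;
        sock_hold(sk);
        batch[n] = sk;
        if (++n == SKARR_SZ)
                break;          /* resume from this point on the next pass */
}
spin_unlock_bh(lock);

for (i = 0; i < n; i++) {
        if (err >= 0)
                err = emit(batch[i]);
        sock_gen_put(batch[i]); /* drop every held reference regardless */
}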
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ca97835bfec4..8bea74298173 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
- tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
@@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
{
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
- tb->num_owners++;
inet_csk(sk)->icsk_bind_hash = tb;
}
@@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
- tb->num_owners--;
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -435,10 +432,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
static int inet_reuseport_add_sock(struct sock *sk,
- struct inet_listen_hashbucket *ilb,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+ struct inet_listen_hashbucket *ilb)
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
struct sock *sk2;
@@ -451,7 +445,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
inet_csk(sk2)->icsk_bind_hash == tb &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
- saddr_same(sk, sk2, false))
+ inet_rcv_saddr_equal(sk, sk2, false))
return reuseport_add_sock(sk, sk2);
}
@@ -461,10 +455,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
return 0;
}
-int __inet_hash(struct sock *sk, struct sock *osk,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+int __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
@@ -479,7 +470,7 @@ int __inet_hash(struct sock *sk, struct sock *osk,
spin_lock(&ilb->lock);
if (sk->sk_reuseport) {
- err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+ err = inet_reuseport_add_sock(sk, ilb);
if (err)
goto unlock;
}
@@ -503,7 +494,7 @@ int inet_hash(struct sock *sk)
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
+ err = __inet_hash(sk, NULL);
local_bh_enable();
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index ddcd56c08d14..f8aff2c71cde 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
-void inet_twsk_purge(struct inet_hashinfo *hashinfo,
- struct inet_timewait_death_row *twdr, int family)
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
{
struct inet_timewait_sock *tw;
struct sock *sk;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bbe7f72db9c1..b3cdeec85f1f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -198,6 +198,7 @@ static void ip_expire(unsigned long arg)
qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags);
+ rcu_read_lock();
spin_lock(&qp->q.lock);
if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -207,7 +208,7 @@ static void ip_expire(unsigned long arg)
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
if (!inet_frag_evicting(&qp->q)) {
- struct sk_buff *head = qp->q.fragments;
+ struct sk_buff *clone, *head = qp->q.fragments;
const struct iphdr *iph;
int err;
@@ -216,32 +217,40 @@ static void ip_expire(unsigned long arg)
if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
goto out;
- rcu_read_lock();
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
- goto out_rcu_unlock;
+ goto out;
+
/* skb has no dst, perform route lookup again */
iph = ip_hdr(head);
err = ip_route_input_noref(head, iph->daddr, iph->saddr,
iph->tos, head->dev);
if (err)
- goto out_rcu_unlock;
+ goto out;
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (frag_expire_skip_icmp(qp->user) &&
(skb_rtable(head)->rt_type != RTN_LOCAL))
- goto out_rcu_unlock;
+ goto out;
+
+ clone = skb_clone(head, GFP_ATOMIC);
/* Send an ICMP "Fragment Reassembly Timeout" message. */
- icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-out_rcu_unlock:
- rcu_read_unlock();
+ if (clone) {
+ spin_unlock(&qp->q.lock);
+ icmp_send(clone, ICMP_TIME_EXCEEDED,
+ ICMP_EXC_FRAGTIME, 0);
+ consume_skb(clone);
+ goto out_rcu_unlock;
+ }
}
out:
spin_unlock(&qp->q.lock);
+out_rcu_unlock:
+ rcu_read_unlock();
ipq_put(qp);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 576f705d8180..c9c1cb635d9a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -17,7 +17,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
@@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
-static int ipgre_net_id __read_mostly;
-static int gre_tap_net_id __read_mostly;
+static unsigned int ipgre_net_id __read_mostly;
+static unsigned int gre_tap_net_id __read_mostly;
static void ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4d158ff1def1..93157f2f4758 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -15,7 +15,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 877bdb02e887..7a3fd25e8913 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -42,7 +42,7 @@
* Hirokazu Takahashi: sendfile() on UDP works now.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -74,6 +74,7 @@
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
@@ -221,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
- int res = dst_neigh_output(dst, neigh, skb);
+ int res;
+
+ sock_confirm_neigh(skb, neigh);
+ res = neigh_output(neigh, skb);
rcu_read_unlock_bh();
return res;
@@ -287,6 +291,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -305,6 +316,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ return dev_loopback_xmit(net, sk, skb);
+}
+
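Both ip_finish_output() and the new ip_mc_finish_output() now consult BPF_CGROUP_RUN_PROG_INET_EGRESS(), so a cgroup-attached program gets the final say on unicast and looped-back multicast alike; a non-zero return becomes the kfree_skb() path above. A minimal program of the kind this hook runs, assuming the usual libbpf section conventions, might look like:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* attach type BPF_CGROUP_INET_EGRESS: return 1 to allow, 0 to drop */
SEC("cgroup_skb/egress")
int egress_limit_len(struct __sk_buff *skb)
{
        return skb->len <= 1400;        /* e.g. drop oversized packets */
}

char _license[] SEC("license") = "GPL";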
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
@@ -342,7 +367,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -358,7 +383,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
@@ -583,7 +608,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
*/
if (skb_has_frag_list(skb)) {
struct sk_buff *frag, *frag2;
- int first_len = skb_pagelen(skb);
+ unsigned int first_len = skb_pagelen(skb);
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
@@ -804,11 +829,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
struct msghdr *msg = from;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- if (copy_from_iter(to, len, &msg->msg_iter) != len)
+ if (!copy_from_iter_full(to, len, &msg->msg_iter))
return -EFAULT;
} else {
__wsum csum = 0;
- if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
+ if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, odd);
}
@@ -864,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
skb->csum = 0;
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -936,9 +964,9 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+ if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
+ (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
(sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
@@ -1064,6 +1092,9 @@ alloc_new_skb:
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ if ((flags & MSG_CONFIRM) && !skb_prev)
+ skb_set_dst_pending_confirm(skb, 1);
+
/*
* Put the packet on the pending queue.
*/
@@ -1594,7 +1625,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -1606,6 +1638,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_mark = fl4.flowi4_mark;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT);
if (unlikely(err)) {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b8a2d63d1fb8..1d46d05efb0f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -44,7 +44,7 @@
#include <net/ip_fib.h>
#include <linux/errqueue.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* SOL_IP control messages.
@@ -97,6 +97,17 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
}
+static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
+{
+ int val;
+
+ if (IPCB(skb)->frag_max_size == 0)
+ return;
+
+ val = IPCB(skb)->frag_max_size;
+ put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
+}
+
static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
int tlen, int offset)
{
@@ -105,10 +116,10 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_COMPLETE)
return;
- if (offset != 0)
- csum = csum_sub(csum,
- csum_partial(skb_transport_header(skb) + tlen,
- offset, 0));
+ if (offset != 0) {
+ int tend_off = skb_transport_offset(skb) + tlen;
+ csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0));
+ }
put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
}
@@ -137,7 +148,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
__be16 *ports = (__be16 *)skb_transport_header(skb);
- if (skb_transport_offset(skb) + 4 > skb->len)
+ if (skb_transport_offset(skb) + 4 > (int)skb->len)
return;
/* All current transport protocols have the port numbers in the
@@ -153,10 +164,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
- int tlen, int offset)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, int tlen, int offset)
{
- struct inet_sock *inet = inet_sk(skb->sk);
+ struct inet_sock *inet = inet_sk(sk);
unsigned int flags = inet->cmsg_flags;
/* Ordered by supposed usage frequency */
@@ -218,6 +229,9 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
if (flags & IP_CMSG_CHECKSUM)
ip_cmsg_recv_checksum(msg, skb, tlen, offset);
+
+ if (flags & IP_CMSG_RECVFRAGSIZE)
+ ip_cmsg_recv_fragsize(msg, skb);
}
EXPORT_SYMBOL(ip_cmsg_recv_offset);
@@ -258,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
continue;
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
- err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = cmsg->cmsg_len - sizeof(struct cmsghdr);
/* Our caller is responsible for freeing ipc->opt */
err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
@@ -474,16 +488,15 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
return false;
/* Support IP_PKTINFO on tstamp packets if requested, to correlate
- * timestamp with egress dev. Not possible for packets without dev
+ * timestamp with egress dev. Not possible for packets without iif
* or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
*/
- if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
- (!skb->dev))
+ info = PKTINFO_SKB_CB(skb);
+ if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
+ !info->ipi_ifindex)
return false;
- info = PKTINFO_SKB_CB(skb);
info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
- info->ipi_ifindex = skb->dev->ifindex;
return true;
}
@@ -577,6 +590,7 @@ static bool setsockopt_needs_rtnl(int optname)
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_UNBLOCK_SOURCE:
+ case IP_ROUTER_ALERT:
return true;
}
return false;
@@ -614,6 +628,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_MULTICAST_LOOP:
case IP_RECVORIGDSTADDR:
case IP_CHECKSUM:
+ case IP_RECVFRAGSIZE:
if (optlen >= sizeof(int)) {
if (get_user(val, (int __user *) optval))
return -EFAULT;
@@ -726,6 +741,14 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
}
break;
+ case IP_RECVFRAGSIZE:
+ if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+ goto e_inval;
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
+ break;
case IP_TOS: /* This sets both TOS and Precedence */
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
@@ -820,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
{
struct ip_mreqn mreq;
struct net_device *dev = NULL;
+ int midx;
if (sk->sk_type == SOCK_STREAM)
goto e_inval;
@@ -864,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = -EADDRNOTAVAIL;
if (!dev)
break;
+
+ midx = l3mdev_master_ifindex(dev);
+
dev_put(dev);
err = -EINVAL;
if (sk->sk_bound_dev_if &&
- mreq.imr_ifindex != sk->sk_bound_dev_if)
+ mreq.imr_ifindex != sk->sk_bound_dev_if &&
+ (!midx || midx != sk->sk_bound_dev_if))
break;
inet->mc_index = mreq.imr_ifindex;
@@ -1202,14 +1230,27 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
* which has interface index (iif) as the first member of the
* underlying inet{6}_skb_parm struct. This code then overlays
* PKTINFO_SKB_CB and in_pktinfo also has iif as the first
- * element so the iif is picked up from the prior IPCB
+ * element so the iif is picked up from the prior IPCB. If iif
+ * is the loopback interface, then return the sending interface
+ * (e.g., process binds socket to eth0 for Tx which is
+ * redirected to loopback in the rtable/dst).
*/
+ if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
+ pktinfo->ipi_ifindex = inet_iif(skb);
+
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
}
- skb_dst_drop(skb);
+ /* We need to keep the dst for __ip_options_echo()
+ * We could restrict the test to opt.ts_needtime || opt.srr,
+ * but the following is good enough as IP options are not often used.
+ */
+ if (unlikely(IPCB(skb)->opt.optlen))
+ skb_dst_force(skb);
+ else
+ skb_dst_drop(skb);
}
int ip_setsockopt(struct sock *sk, int level,
@@ -1357,6 +1398,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_CHECKSUM:
val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
break;
+ case IP_RECVFRAGSIZE:
+ val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
+ break;
case IP_TOS:
val = inet->tos;
break;
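Taken together, the IP_RECVFRAGSIZE hunks add a setsockopt flag (valid for SOCK_RAW and SOCK_DGRAM only), its getsockopt mirror, and a cmsg reporting the largest fragment of a reassembled datagram; the cmsg is only emitted when frag_max_size is non-zero, i.e. the packet actually arrived fragmented. A hedged userspace sketch; IP_RECVFRAGSIZE may need defining by hand (value 25) with older uapi headers, and error handling is elided:

#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <stdio.h>

static void recv_with_fragsize(int sk) /* bound SOCK_DGRAM or SOCK_RAW */
{
        char data[2048], cbuf[CMSG_SPACE(sizeof(int))];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                              .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
        struct cmsghdr *cm;
        int on = 1, frag;

        setsockopt(sk, SOL_IP, IP_RECVFRAGSIZE, &on, sizeof(on));
        recvmsg(sk, &msg, 0);
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
                if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVFRAGSIZE) {
                        memcpy(&frag, CMSG_DATA(cm), sizeof(frag));
                        printf("largest fragment: %d bytes\n", frag);
                }
}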
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 5719d6ba0824..823abaef006b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -358,6 +358,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
{
struct ip_tunnel *nt;
struct net_device *dev;
+ int t_hlen;
BUG_ON(!itn->fb_tunnel_dev);
dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
@@ -367,6 +368,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
dev->mtu = ip_tunnel_bind_dev(dev);
nt = netdev_priv(dev);
+ t_hlen = nt->hlen + sizeof(struct iphdr);
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
ip_tunnel_add(itn, nt);
return nt;
}
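For scale: 0xFFF8 (65528) is the largest IPv4 payload length that is still a multiple of the 8-byte fragment unit, so a bare IPIP tunnel with hard_header_len = 0 and t_hlen = 20 ends up with max_mtu = 65528 - 20 = 65508, while ETH_MIN_MTU (68) stays the floor.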
@@ -929,7 +933,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
- if (new_mtu < 68)
+ if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
if (new_mtu > max_mtu) {
@@ -990,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev)
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
-int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
@@ -1192,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev)
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do least required initialization, rest of init is done in tunnel_init call */
-void ip_tunnel_setup(struct net_device *dev, int net_id)
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
tunnel->ip_tnl_net_id = net_id;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index fed3d29f9eb3..a31f47ccaad9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
/* Often modified stats are per cpu, other are shared (netdev->stats) */
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
+void ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
{
int i;
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
tot->rx_bytes += rx_bytes;
tot->tx_bytes += tx_bytes;
}
-
- return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
@@ -228,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
[LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
};
-static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
@@ -313,6 +311,7 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
.fill_encap = ip_tun_fill_encap_info,
.get_encap_size = ip_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
};
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
@@ -324,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
};
-static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip6_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
@@ -403,6 +402,7 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
.fill_encap = ip6_tun_fill_encap_info,
.get_encap_size = ip6_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
+ .owner = THIS_MODULE,
};
void __init ip_tunnel_core_init(void)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5d7944f394d9..8b14f1404c8f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -46,7 +46,7 @@
static struct rtnl_link_ops vti_link_ops __read_mostly;
-static int vti_net_id __read_mostly;
+static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 071a785c65eb..dfb2ab2dd3c8 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -61,7 +61,7 @@
#include <net/ipconfig.h>
#include <net/route.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/checksum.h>
#include <asm/processor.h>
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void)
while ((d = next)) {
next = d->next;
dev = d->dev;
- if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) {
+ if (d != ic_dev && !netdev_uses_dsa(dev)) {
pr_debug("IP-Config: Downing %s\n", dev->name);
dev_change_flags(dev, d->flags);
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c9392589c415..00d4229b6954 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -96,7 +96,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
@@ -121,7 +121,7 @@ static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-static int ipip_net_id __read_mostly;
+static unsigned int ipip_net_id __read_mostly;
static int ipip_tunnel_init(struct net_device *dev);
static struct rtnl_link_ops ipip_link_ops __read_mostly;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 27089f5ebbb1..b036e85e093b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,7 +26,7 @@
*
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
@@ -137,6 +137,9 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
.flags = FIB_LOOKUP_NOREF,
};
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp4));
+
err = fib_rules_lookup(net->ipv4.mr_rules_ops,
flowi4_to_flowi(flp4), 0, &arg);
if (err < 0)
@@ -163,7 +166,9 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
return -EINVAL;
}
- mrt = ipmr_get_table(rule->fr_net, rule->table);
+ arg->table = fib_rule_get_table(rule, arg);
+
+ mrt = ipmr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -294,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
}
#endif
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct mfc_cache_cmp_arg *cmparg = arg->key;
+ struct mfc_cache *c = (struct mfc_cache *)ptr;
+
+ return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
+ cmparg->mfc_origin != c->mfc_origin;
+}
+
+static const struct rhashtable_params ipmr_rht_params = {
+ .head_offset = offsetof(struct mfc_cache, mnode),
+ .key_offset = offsetof(struct mfc_cache, cmparg),
+ .key_len = sizeof(struct mfc_cache_cmp_arg),
+ .nelem_hint = 3,
+ .locks_mul = 1,
+ .obj_cmpfn = ipmr_hash_cmp,
+ .automatic_shrinking = true,
+};
+
static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
struct mr_table *mrt;
- unsigned int i;
/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -313,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
write_pnet(&mrt->net, net);
mrt->id = id;
- /* Forwarding cache */
- for (i = 0; i < MFC_LINES; i++)
- INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
-
+ rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
+ INIT_LIST_HEAD(&mrt->mfc_cache_list);
INIT_LIST_HEAD(&mrt->mfc_unres_queue);
setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
@@ -333,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
mroute_clean_tables(mrt, true);
+ rhltable_destroy(&mrt->mfc_hash);
kfree(mrt);
}
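The table is now keyed on the (mcastgrp, origin) pair; rhltable, unlike a plain rhashtable, chains same-key objects on one list, which is what lets several entries for one (S,G) with different parent VIFs coexist (see ipmr_cache_find_parent() below), and obj_cmpfn follows the memcmp convention of returning non-zero on mismatch. A sketch of an insert under those assumptions (add_sg is an illustrative name):

static int add_sg(struct mr_table *mrt, struct mfc_cache *c)
{
        /* duplicates of the (group, origin) key simply chain on the
         * same rhlist bucket instead of being rejected */
        return rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
                                   ipmr_rht_params);
}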
@@ -834,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
__be32 origin,
__be32 mcastgrp)
{
- int line = MFC_HASH(mcastgrp, origin);
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
- return c;
- }
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ return c;
+
return NULL;
}
@@ -848,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
int vifi)
{
- int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = htonl(INADDR_ANY),
+ .mfc_origin = htonl(INADDR_ANY)
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
- if (c->mfc_origin == htonl(INADDR_ANY) &&
- c->mfc_mcastgrp == htonl(INADDR_ANY) &&
- c->mfc_un.res.ttls[vifi] < 255)
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (c->mfc_un.res.ttls[vifi] < 255)
return c;
return NULL;
@@ -864,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
__be32 mcastgrp, int vifi)
{
- int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = htonl(INADDR_ANY)
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c, *proxy;
if (mcastgrp == htonl(INADDR_ANY))
goto skip;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
- if (c->mfc_origin == htonl(INADDR_ANY) &&
- c->mfc_mcastgrp == mcastgrp) {
- if (c->mfc_un.res.ttls[vifi] < 255)
- return c;
-
- /* It's ok if the vifi is part of the static tree */
- proxy = ipmr_cache_find_any_parent(mrt,
- c->mfc_parent);
- if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
- return c;
- }
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
skip:
return ipmr_cache_find_any_parent(mrt, vifi);
}
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+ __be32 origin, __be32 mcastgrp,
+ int parent)
+{
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin,
+ };
+ struct rhlist_head *tmp, *list;
+ struct mfc_cache *c;
+
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (parent == -1 || parent == c->mfc_parent)
+ return c;
+
+ return NULL;
+}
+
/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
@@ -1023,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
struct sk_buff *skb)
{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct mfc_cache *c;
bool found = false;
int err;
- struct mfc_cache *c;
- const struct iphdr *iph = ip_hdr(skb);
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1090,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
- int line;
- struct mfc_cache *c, *next;
+ struct mfc_cache *c;
- line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (!c)
+ return -ENOENT;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
- list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
- (parent == -1 || parent == c->mfc_parent)) {
- list_del_rcu(&c->list);
- mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
- return 0;
- }
- }
- return -ENOENT;
+ return 0;
}
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
struct mfcctl *mfc, int mrtsock, int parent)
{
- bool found = false;
- int line;
struct mfc_cache *uc, *c;
+ bool found;
+ int ret;
if (mfc->mfcc_parent >= MAXVIFS)
return -ENFILE;
- line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
-
- list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
- (parent == -1 || parent == c->mfc_parent)) {
- found = true;
- break;
- }
- }
-
- if (found) {
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (c) {
write_lock_bh(&mrt_lock);
c->mfc_parent = mfc->mfcc_parent;
ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1155,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
- list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
-
+ ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+ ipmr_rht_params);
+ if (ret) {
+ pr_err("ipmr: rhtable insert error %d\n", ret);
+ ipmr_cache_free(c);
+ return ret;
+ }
+ list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
/* Check to see if we resolved a queued list. If so we
* need to send on the frames and tidy up.
*/
@@ -1186,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
/* Close the multicast socket, and clear the vif tables etc */
static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
- int i;
+ struct mfc_cache *c, *tmp;
LIST_HEAD(list);
- struct mfc_cache *c, *next;
+ int i;
/* Shut down all active vif entries */
for (i = 0; i < mrt->maxvif; i++) {
@@ -1199,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
unregister_netdevice_many(&list);
/* Wipe the cache */
- for (i = 0; i < MFC_LINES; i++) {
- list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (!all && (c->mfc_flags & MFC_STATIC))
- continue;
- list_del_rcu(&c->list);
- mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
- }
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (!all && (c->mfc_flags & MFC_STATIC))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
}
if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
spin_lock_bh(&mfc_unres_lock);
- list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
list_del(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_destroy_unres(mrt, c);
@@ -1228,7 +1278,7 @@ static void mrtsock_destruct(struct sock *sk)
struct net *net = sock_net(sk);
struct mr_table *mrt;
- rtnl_lock();
+ ASSERT_RTNL();
ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1239,7 +1289,6 @@ static void mrtsock_destruct(struct sock *sk)
mroute_clean_tables(mrt, false);
}
}
- rtnl_unlock();
}
/* Socket options and virtual interface manipulation. The whole
@@ -1303,13 +1352,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
if (sk != rcu_access_pointer(mrt->mroute_sk)) {
ret = -EACCES;
} else {
- /* We need to unlock here because mrtsock_destruct takes
- * care of rtnl itself and we can't change that due to
- * the IP_ROUTER_ALERT setsockopt which runs without it.
- */
- rtnl_unlock();
ret = ip_ra_control(sk, 0, NULL);
- goto out;
+ goto out_unlock;
}
break;
case MRT_ADD_VIF:
@@ -1420,7 +1464,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
}
out_unlock:
rtnl_unlock();
-out:
return ret;
}
@@ -1786,9 +1829,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, struct mfc_cache *cache,
int local)
{
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
int psend = -1;
int vif, ct;
- int true_vifi = ipmr_find_vif(mrt, skb->dev);
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
@@ -1809,6 +1852,12 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
/* Wrong interface: drop packet and (maybe) send PIM assert. */
if (mrt->vif_table[vif].dev != skb->dev) {
+ struct net_device *mdev;
+
+ mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
+ if (mdev == skb->dev)
+ goto forward;
+
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -2053,7 +2102,7 @@ static int pim_rcv(struct sk_buff *skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
(pim->flags & PIM_NULL_REGISTER) ||
(ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
@@ -2080,8 +2129,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
- if (c->mfc_parent >= MAXVIFS)
+ if (c->mfc_parent >= MAXVIFS) {
+ rtm->rtm_flags |= RTNH_F_UNRESOLVED;
return -ENOENT;
+ }
if (VIF_EXISTS(mrt, c->mfc_parent) &&
nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
@@ -2123,7 +2174,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ipmr_get_route(struct net *net, struct sk_buff *skb,
__be32 saddr, __be32 daddr,
- struct rtmsg *rtm, int nowait, u32 portid)
+ struct rtmsg *rtm, u32 portid)
{
struct mfc_cache *cache;
struct mr_table *mrt;
@@ -2147,11 +2198,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
struct net_device *dev;
int vif = -1;
- if (nowait) {
- rcu_read_unlock();
- return -EAGAIN;
- }
-
dev = skb->dev;
read_lock(&mrt_lock);
if (dev)
@@ -2285,34 +2331,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
struct mr_table *mrt;
struct mfc_cache *mfc;
unsigned int t = 0, s_t;
- unsigned int h = 0, s_h;
unsigned int e = 0, s_e;
s_t = cb->args[0];
- s_h = cb->args[1];
- s_e = cb->args[2];
+ s_e = cb->args[1];
rcu_read_lock();
ipmr_for_each_table(mrt, net) {
if (t < s_t)
goto next_table;
- if (t > s_t)
- s_h = 0;
- for (h = s_h; h < MFC_LINES; h++) {
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
- if (e < s_e)
- goto next_entry;
- if (ipmr_fill_mroute(mrt, skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- mfc, RTM_NEWROUTE,
- NLM_F_MULTI) < 0)
- goto done;
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ if (e < s_e)
+ goto next_entry;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
+ goto done;
next_entry:
- e++;
- }
- e = s_e = 0;
+ e++;
}
+ e = 0;
+ s_e = 0;
+
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
if (e < s_e)
@@ -2329,16 +2371,15 @@ next_entry2:
e++;
}
spin_unlock_bh(&mfc_unres_lock);
- e = s_e = 0;
- s_h = 0;
+ e = 0;
+ s_e = 0;
next_table:
t++;
}
done:
rcu_read_unlock();
- cb->args[2] = e;
- cb->args[1] = h;
+ cb->args[1] = e;
cb->args[0] = t;
return skb->len;
@@ -2548,7 +2589,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
const char *name = vif->dev ? vif->dev->name : "none";
seq_printf(seq,
- "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
vif - mrt->vif_table,
name, vif->bytes_in, vif->pkt_in,
vif->bytes_out, vif->pkt_out,
@@ -2582,10 +2623,8 @@ struct ipmr_mfc_iter {
struct seq_net_private p;
struct mr_table *mrt;
struct list_head *cache;
- int ct;
};
-
static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct ipmr_mfc_iter *it, loff_t pos)
{
@@ -2593,12 +2632,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct mfc_cache *mfc;
rcu_read_lock();
- for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
- it->cache = &mrt->mfc_cache_array[it->ct];
- list_for_each_entry_rcu(mfc, it->cache, list)
- if (pos-- == 0)
- return mfc;
- }
+ it->cache = &mrt->mfc_cache_list;
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+ if (pos-- == 0)
+ return mfc;
rcu_read_unlock();
spin_lock_bh(&mfc_unres_lock);
@@ -2625,17 +2662,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
it->mrt = mrt;
it->cache = NULL;
- it->ct = 0;
return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
: SEQ_START_TOKEN;
}
static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct mfc_cache *mfc = v;
struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt = it->mrt;
+ struct mfc_cache *mfc = v;
++*pos;
@@ -2648,19 +2684,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (it->cache == &mrt->mfc_unres_queue)
goto end_of_list;
- BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
-
- while (++it->ct < MFC_LINES) {
- it->cache = &mrt->mfc_cache_array[it->ct];
- if (list_empty(it->cache))
- continue;
- return list_first_entry(it->cache, struct mfc_cache, list);
- }
-
/* exhausted cache_array, show unresolved */
rcu_read_unlock();
it->cache = &mrt->mfc_unres_queue;
- it->ct = 0;
spin_lock_bh(&mfc_unres_lock);
if (!list_empty(it->cache))
@@ -2680,7 +2706,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
if (it->cache == &mrt->mfc_unres_queue)
spin_unlock_bh(&mfc_unres_lock);
- else if (it->cache == &mrt->mfc_cache_array[it->ct])
+ else if (it->cache == &mrt->mfc_cache_list)
rcu_read_unlock();
}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index b3cc1335adbc..c0cc6aa8cfaa 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -23,7 +23,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
struct rtable *rt;
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
- __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+ const struct sock *sk = skb_to_full_sk(skb);
+ __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
struct net_device *dev = skb_dst(skb)->dev;
unsigned int hh_len;
@@ -40,7 +41,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
if (!fl4.flowi4_oif)
fl4.flowi4_oif = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
@@ -61,7 +62,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
skb_dst_set(skb, NULL);
- dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
+ dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_dst_set(skb, dst);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d613309e3e5d..c11eb1744ab1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV4
To compile it as a module, choose M here. If unsure, say N.
+config NF_SOCKET_IPV4
+ tristate "IPv4 socket lookup support"
+ help
+ This option enables the IPv4 socket lookup infrastructure. This
+ is required by the iptables socket match.
+
if NF_TABLES
config NF_TABLES_IPV4
@@ -54,6 +60,14 @@ config NFT_DUP_IPV4
help
This module enables IPv4 packet duplication support for nf_tables.
+config NFT_FIB_IPV4
+ select NFT_FIB
+ tristate "nf_tables fib / ip route lookup support"
+ help
+ This module enables IPv4 FIB lookups, e.g. for reverse path filtering.
+ It also allows querying the FIB for the route type, e.g. local, unicast,
+ multicast or blackhole.
+
endif # NF_TABLES_IPV4
config NF_TABLES_ARP
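Both new options back user-visible match types. As rough usage examples (exact syntax depends on the userspace tool version): NF_SOCKET_IPV4 is what "iptables -m socket" relies on in TPROXY-style setups, and NFT_FIB_IPV4 backs rules such as "nft add rule ip filter prerouting fib saddr . iif oif missing drop" for strict reverse-path filtering.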
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 853328f8fd05..f462fee66ac8 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+
# logging
obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 697538464e6e..6241a81fd7f5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -24,7 +24,7 @@
#include <linux/err.h>
#include <net/compat.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
@@ -217,11 +217,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
*/
e = get_entry(table_base, private->hook_entry[hook]);
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.hooknum = hook;
- acpar.family = NFPROTO_ARP;
+ acpar.state = state;
acpar.hotdrop = false;
arp = arp_hdr(skb);
@@ -415,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name)
}
static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
- unsigned long pcnt;
int ret;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
@@ -443,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
err:
module_put(t->u.kernel.target->me);
out:
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -523,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -532,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
static int translate_table(struct xt_table_info *newinfo, void *entry0,
const struct arpt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct arpt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -594,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, repl->name, repl->size);
+ ret = find_check_entry(iter, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -681,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
loc_cpu_entry = private->entries;
- /* ... then copy entire thing ... */
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
@@ -693,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size,
const struct xt_entry_target *t;
e = (struct arpt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct arpt_entry, counters),
&counters[num],
@@ -702,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size,
}
t = arpt_get_target_c(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct xt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -809,7 +800,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -838,7 +829,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(NFPROTO_ARP);
@@ -863,7 +854,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
@@ -875,7 +866,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -902,8 +893,8 @@ static int __do_replace(struct net *net, const char *name,
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1018,8 +1009,8 @@ static int do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1408,7 +1399,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
@@ -1423,7 +1414,7 @@ static int compat_get_entries(struct net *net,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(NFPROTO_ARP);
return ret;
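
The find_check_entry() changes in arp_tables (and in ip_tables below) replace one alloc_percpu() call per rule with a batched allocator threaded through translate_table(). A minimal sketch of the batching idea follows; the block size and helper names are illustrative assumptions, not the exact x_tables code:

        #define PCPU_BLOCK_SIZE 4096    /* assumed batch size */

        struct percpu_alloc_state {     /* stands in for xt_percpu_counter_alloc_state */
                unsigned int off;
                const char __percpu *mem;
        };

        static bool percpu_counter_alloc_sketch(struct percpu_alloc_state *state,
                                                struct xt_counters *counter)
        {
                if (!state->mem) {      /* carve out a fresh percpu block */
                        state->mem = __alloc_percpu(PCPU_BLOCK_SIZE,
                                                    PCPU_BLOCK_SIZE);
                        if (!state->mem)
                                return false;
                        state->off = 0;
                }
                counter->pcnt = (__force unsigned long)(state->mem + state->off);
                state->off += sizeof(*counter);
                if (state->off > PCPU_BLOCK_SIZE - sizeof(*counter))
                        state->mem = NULL;      /* exhausted; reallocate on next call */
                return true;
        }

Freeing stays per-entry: xt_percpu_counter_free(&e->counters) releases the underlying percpu block only when handed the counter sitting at offset 0, which is why the same struct xt_counters pointer now travels through both alloc and free.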
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7c00ce90adb8..384b85713e06 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -20,7 +20,7 @@
#include <linux/icmp.h>
#include <net/ip.h>
#include <net/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
@@ -261,11 +261,7 @@ ipt_do_table(struct sk_buff *skb,
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.family = NFPROTO_IPV4;
- acpar.hooknum = hook;
+ acpar.state = state;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
local_bh_disable();
@@ -535,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
- unsigned int size)
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
@@ -543,12 +540,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- unsigned long pcnt;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -586,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
cleanup_match(ematch, net);
}
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -674,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -683,6 +677,7 @@ static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ipt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ipt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -742,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, net, repl->name, repl->size);
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -830,10 +826,6 @@ copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
loc_cpu_entry = private->entries;
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
@@ -843,6 +835,10 @@ copy_entries_to_user(unsigned int total_size,
const struct xt_entry_target *t;
e = (struct ipt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -856,23 +852,14 @@ copy_entries_to_user(unsigned int total_size,
i += m->u.match_size) {
m = (void *)e + i;
- if (copy_to_user(userptr + off + i
- + offsetof(struct xt_entry_match,
- u.user.name),
- m->u.kernel.match->name,
- strlen(m->u.kernel.match->name)+1)
- != 0) {
+ if (xt_match_to_user(m, userptr + off + i)) {
ret = -EFAULT;
goto free_counters;
}
}
t = ipt_get_target_c(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct xt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
@@ -977,7 +964,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1007,7 +994,7 @@ static int get_info(struct net *net, void __user *user,
xt_table_unlock(t);
module_put(t->me);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_unlock(AF_INET);
@@ -1032,7 +1019,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
@@ -1043,7 +1030,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
return ret;
}
@@ -1068,8 +1055,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1184,8 +1171,8 @@ do_add_counters(struct net *net, const void __user *user,
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET, tmp.name);
- if (IS_ERR_OR_NULL(t)) {
- ret = t ? PTR_ERR(t) : -ENOENT;
+ if (!t) {
+ ret = -ENOENT;
goto free;
}
@@ -1630,7 +1617,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
- if (!IS_ERR_OR_NULL(t)) {
+ if (t) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
ret = compat_table_info(private, &info);
@@ -1644,7 +1631,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
module_put(t->me);
xt_table_unlock(t);
} else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ ret = -ENOENT;
xt_compat_unlock(AF_INET);
return ret;
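
copy_entries_to_user() no longer bulk-copies the whole table and then patches names in place: each entry header is copied on its own, and match/target headers go through xt_match_to_user()/xt_target_to_user(), so the kernel pointers held in the u.kernel side of the union are never copied out at all. A hedged sketch of what such a helper has to do, field by field rather than via the raw union:

        static int match_to_user_sketch(const struct xt_entry_match *m,
                                        struct xt_entry_match __user *u)
        {
                const struct xt_match *match = m->u.kernel.match;

                /* write size, name and revision individually; the
                 * kernel-pointer half of the union never leaves the kernel
                 */
                if (put_user(m->u.match_size, &u->u.user.match_size) ||
                    copy_to_user(u->u.user.name, match->name,
                                 strlen(match->name) + 1) ||
                    put_user(match->revision, &u->u.user.revision))
                        return -EFAULT;

                /* then the match payload that follows the header */
                if (copy_to_user(u->data, m->data,
                                 m->u.match_size - sizeof(*m)))
                        return -EFAULT;
                return 0;
        }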
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4a9e6db9df8d..9b8841316e7b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -62,7 +62,7 @@ struct clusterip_config {
static const struct file_operations clusterip_proc_fops;
#endif
-static int clusterip_net_id __read_mostly;
+static unsigned int clusterip_net_id __read_mostly;
struct clusterip_net {
struct list_head configs;
@@ -144,6 +144,11 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
rcu_read_lock_bh();
c = __clusterip_config_find(net, clusterip);
if (c) {
+#ifdef CONFIG_PROC_FS
+ if (!c->pde)
+ c = NULL;
+ else
+#endif
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
else if (entry)
@@ -166,14 +171,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c,
static struct clusterip_config *
clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
- struct net_device *dev)
+ struct net_device *dev)
{
+ struct net *net = dev_net(dev);
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
- return NULL;
+ return ERR_PTR(-ENOMEM);
c->dev = dev;
c->clusterip = ip;
@@ -185,6 +191,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
atomic_set(&c->refcount, 1);
atomic_set(&c->entries, 1);
+ spin_lock_bh(&cn->lock);
+ if (__clusterip_config_find(net, ip)) {
+ spin_unlock_bh(&cn->lock);
+ kfree(c);
+
+ return ERR_PTR(-EBUSY);
+ }
+
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
+
#ifdef CONFIG_PROC_FS
{
char buffer[16];
@@ -195,16 +212,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
+ spin_lock_bh(&cn->lock);
+ list_del_rcu(&c->list);
+ spin_unlock_bh(&cn->lock);
kfree(c);
- return NULL;
+
+ return ERR_PTR(-ENOMEM);
}
}
#endif
- spin_lock_bh(&cn->lock);
- list_add_rcu(&c->list, &cn->configs);
- spin_unlock_bh(&cn->lock);
-
return c;
}
@@ -410,16 +427,16 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
config = clusterip_config_init(cipinfo,
e->ip.dst.s_addr, dev);
- if (!config) {
+ if (IS_ERR(config)) {
dev_put(dev);
- return -ENOMEM;
+ return PTR_ERR(config);
}
dev_mc_add(config->dev, config->clustermac);
}
}
cipinfo->config = config;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -444,7 +461,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
clusterip_config_put(cipinfo->config);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
#ifdef CONFIG_COMPAT
@@ -468,6 +485,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
.checkentry = clusterip_tg_check,
.destroy = clusterip_tg_destroy,
.targetsize = sizeof(struct ipt_clusterip_tgt_info),
+ .usersize = offsetof(struct ipt_clusterip_tgt_info, config),
#ifdef CONFIG_COMPAT
.compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
#endif /* CONFIG_COMPAT */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index da7f02a0b868..a03e4e7ef5f9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static unsigned int
@@ -55,7 +55,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
range.min_proto = mr->range[0].min;
range.max_proto = mr->range[0].max;
- return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
+ return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
+ xt_out(par));
+}
+
+static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target masquerade_tg_reg __read_mostly = {
@@ -66,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
+ .destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
};
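
masquerade_tg_check()/masquerade_tg_destroy() now form the standard pairing for conntrack-dependent extensions: a per-netns reference via nf_ct_netns_get() replaces the old global nf_ct_l3proto_try_module_get(), so conntrack hooks are registered only in namespaces that actually use them. SYNPROXY and CLUSTERIP below follow the same shape; a generic sketch, with the bodies as illustration:

        static int example_tg_check(const struct xt_tgchk_param *par)
        {
                /* takes a per-netns conntrack reference; may register hooks */
                return nf_ct_netns_get(par->net, par->family);
        }

        static void example_tg_destroy(const struct xt_tgdtor_param *par)
        {
                /* must balance every successful check */
                nf_ct_netns_put(par->net, par->family);
        }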
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 1d16c0f28df0..8bd0d7b26632 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,7 +34,7 @@ static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_reject_info *reject = par->targinfo;
- int hook = par->hooknum;
+ int hook = xt_hooknum(par);
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(par->net, skb, hook);
+ nf_send_reset(xt_net(par), skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index db5b87509446..3240a2614e82 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net,
goto free_nskb;
if (nfct) {
- nskb->nfct = nfct;
- nskb->nfctinfo = ctinfo;
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
nf_conntrack_get(nfct);
}
@@ -107,8 +106,8 @@ synproxy_send_client_synack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static void
@@ -230,8 +229,8 @@ synproxy_send_client_ack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static bool
@@ -263,12 +262,12 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct net *net = par->net;
+ struct net *net = xt_net(par);
struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
- if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
return NF_DROP;
th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
@@ -418,12 +417,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
e->ip.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_l3proto_try_module_get(par->family);
+ return nf_ct_netns_get(par->net, par->family);
}
static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target synproxy_tg4_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 78cc64eddfc1..37fb9552e858 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -63,10 +63,10 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
return dev_match || flags & XT_RPFILTER_LOOSE;
}
-static bool rpfilter_is_local(const struct sk_buff *skb)
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
{
- const struct rtable *rt = skb_rtable(skb);
- return rt && (rt->rt_flags & RTCF_LOCAL);
+ return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
}
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -79,14 +79,16 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
info = par->matchinfo;
invert = info->flags & XT_RPFILTER_INVERT;
- if (rpfilter_is_local(skb))
+ if (rpfilter_is_loopback(skb, xt_in(par)))
return true ^ invert;
iph = ip_hdr(skb);
- if (ipv4_is_multicast(iph->daddr)) {
- if (ipv4_is_zeronet(iph->saddr))
- return ipv4_is_local_multicast(iph->daddr) ^ invert;
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr))
+ return true ^ invert;
}
+
flow.flowi4_iif = LOOPBACK_IFINDEX;
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
@@ -95,7 +97,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.flowi4_tos = RT_TOS(iph->tos);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
- return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert;
+ return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
}
static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 713c09a74b90..2e14ed11a35c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -31,6 +31,13 @@
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/netfilter/nf_log.h>
+static int conntrack4_net_id __read_mostly;
+static DEFINE_MUTEX(register_ipv4_hooks);
+
+struct conntrack4_net {
+ unsigned int users;
+};
+
static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -158,6 +165,10 @@ static unsigned int ipv4_conntrack_local(void *priv,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
+
+ if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
+ return NF_ACCEPT;
+
return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
}
@@ -228,7 +239,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
}
if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+ pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
*len, sizeof(struct sockaddr_in));
return -EINVAL;
}
@@ -307,9 +318,42 @@ static struct nf_sockopt_ops so_getorigdst = {
.owner = THIS_MODULE,
};
-static int ipv4_init_net(struct net *net)
+static int ipv4_hooks_register(struct net *net)
{
- return 0;
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+ int err = 0;
+
+ mutex_lock(&register_ipv4_hooks);
+
+ cnet->users++;
+ if (cnet->users > 1)
+ goto out_unlock;
+
+ err = nf_defrag_ipv4_enable(net);
+ if (err) {
+ cnet->users = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+
+ if (err)
+ cnet->users = 0;
+ out_unlock:
+ mutex_unlock(&register_ipv4_hooks);
+ return err;
+}
+
+static void ipv4_hooks_unregister(struct net *net)
+{
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+
+ mutex_lock(&register_ipv4_hooks);
+ if (cnet->users && (--cnet->users == 0))
+ nf_unregister_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ mutex_unlock(&register_ipv4_hooks);
}
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
@@ -325,7 +369,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
.nla_policy = ipv4_nla_policy,
#endif
- .init_net = ipv4_init_net,
+ .net_ns_get = ipv4_hooks_register,
+ .net_ns_put = ipv4_hooks_unregister,
.me = THIS_MODULE,
};
@@ -336,52 +381,50 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("ip_conntrack");
MODULE_LICENSE("GPL");
+static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
+ &nf_conntrack_l4proto_tcp4,
+ &nf_conntrack_l4proto_udp4,
+ &nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite4,
+#endif
+};
+
static int ipv4_net_init(struct net *net)
{
int ret = 0;
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
- if (ret < 0) {
- pr_err("nf_conntrack_tcp4: pernet registration failed\n");
- goto out_tcp;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- pr_err("nf_conntrack_udp4: pernet registration failed\n");
- goto out_udp;
- }
- ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- pr_err("nf_conntrack_icmp4: pernet registration failed\n");
- goto out_icmp;
- }
+ ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
+ if (ret < 0)
+ return ret;
ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
if (ret < 0) {
pr_err("nf_conntrack_ipv4: pernet registration failed\n");
- goto out_ipv4;
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
}
- return 0;
-out_ipv4:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
-out_icmp:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
-out_udp:
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
-out_tcp:
return ret;
}
static void ipv4_net_exit(struct net *net)
{
nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
- nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
}
static struct pernet_operations ipv4_net_ops = {
.init = ipv4_net_init,
.exit = ipv4_net_exit,
+ .id = &conntrack4_net_id,
+ .size = sizeof(struct conntrack4_net),
};
static int __init nf_conntrack_l3proto_ipv4_init(void)
@@ -389,7 +432,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
int ret = 0;
need_conntrack();
- nf_defrag_ipv4_enable();
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
@@ -403,46 +445,21 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
goto cleanup_sockopt;
}
- ret = nf_register_hooks(ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register hooks.\n");
+ ret = nf_ct_l4proto_register(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
+ if (ret < 0)
goto cleanup_pernet;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
- goto cleanup_hooks;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
- goto cleanup_tcp4;
- }
-
- ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
- goto cleanup_udp4;
- }
ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
if (ret < 0) {
pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
- goto cleanup_icmpv4;
+ goto cleanup_l4proto;
}
return ret;
- cleanup_icmpv4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- cleanup_udp4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- cleanup_tcp4:
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- cleanup_hooks:
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+cleanup_l4proto:
+ nf_ct_l4proto_unregister(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
cleanup_pernet:
unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
@@ -454,10 +471,8 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
{
synchronize_net();
nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+ nf_ct_l4proto_unregister(builtin_l4proto4,
+ ARRAY_SIZE(builtin_l4proto4));
unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
}
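
Two ideas drive this rewrite: the builtin l4protos are registered as one array with automatic unwinding, and hook registration moves behind a per-netns user count in ipv4_hooks_register(). A sketch of the unwinding contract the array helpers provide; the *_one() helpers here are assumptions standing in for the per-protocol internals:

        static int l4proto_register_sketch(struct nf_conntrack_l4proto *p[],
                                           unsigned int n)
        {
                unsigned int i;
                int err = 0;

                for (i = 0; i < n; i++) {
                        err = register_one(p[i]);       /* assumed helper */
                        if (err < 0)
                                break;
                }
                if (err)
                        while (i-- > 0)
                                unregister_one(p[i]);   /* unwind in reverse */
                return err;
        }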
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d075b3cf2400..73c591d8a9a8 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -128,16 +128,16 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
static int
icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
unsigned int hooknum)
{
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_zone *zone;
+ enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
- NF_CT_ASSERT(skb->nfct == NULL);
+ NF_CT_ASSERT(!skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return -NF_ACCEPT;
}
- *ctinfo = IP_CT_RELATED;
+ ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(net, zone, &innertuple);
if (!h) {
@@ -169,11 +169,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
}
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
+ ctinfo += IP_CT_IS_REPLY;
/* Update skb to refer to this connection */
- skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
- skb->nfctinfo = *ctinfo;
+ nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
return NF_ACCEPT;
}
@@ -181,7 +180,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
static int
icmp_error(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+ u8 pf, unsigned int hooknum)
{
const struct icmphdr *icmph;
struct icmphdr _ih;
@@ -225,7 +224,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
icmph->type != ICMP_REDIRECT)
return NF_ACCEPT;
- return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+ return icmp_error_message(net, tmpl, skb, hooknum);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index d88da36b383c..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -11,6 +11,7 @@
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <net/netns/generic.h>
#include <net/route.h>
#include <net/ip.h>
@@ -22,6 +23,8 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
+static DEFINE_MUTEX(defrag4_mutex);
+
static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
u_int32_t user)
{
@@ -42,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
{
u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (skb->nfct) {
+ if (skb_nfct(skb)) {
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
@@ -72,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
#if !IS_ENABLED(CONFIG_NF_NAT)
/* Previously seen (loopback)? Ignore. Do this before
fragment check. */
- if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
return NF_ACCEPT;
#endif
#endif
@@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
},
};
+static void __net_exit defrag4_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv4) {
+ nf_unregister_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ net->nf.defrag_ipv4 = false;
+ }
+}
+
+static struct pernet_operations defrag4_net_ops = {
+ .exit = defrag4_net_exit,
+};
+
static int __init nf_defrag_init(void)
{
- return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ return register_pernet_subsys(&defrag4_net_ops);
}
static void __exit nf_defrag_fini(void)
{
- nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ unregister_pernet_subsys(&defrag4_net_ops);
}
-void nf_defrag_ipv4_enable(void)
+int nf_defrag_ipv4_enable(struct net *net)
{
+ int err = 0;
+
+ might_sleep();
+
+ if (net->nf.defrag_ipv4)
+ return 0;
+
+ mutex_lock(&defrag4_mutex);
+ if (net->nf.defrag_ipv4)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv4 = true;
+
+ out_unlock:
+ mutex_unlock(&defrag4_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
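
nf_defrag_ipv4_enable() is double-checked locking: the unlocked test makes the common already-enabled case mutex-free, and the re-check under defrag4_mutex serialises concurrent first enables. Callers change from a void global switch to a fallible per-netns call; a hedged caller-side sketch:

        static int example_net_init(struct net *net)
        {
                int err = nf_defrag_ipv4_enable(net);   /* may sleep */

                if (err)
                        pr_err("cannot enable IPv4 defrag: %d\n", err);
                return err;
        }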
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index cf986e1c7bbd..f0dbff05fc28 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -68,10 +68,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Avoid counting cloned packets towards the original connection. */
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
+ nf_reset(skb);
+ nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+ nf_conntrack_get(skb_nfct(skb));
#endif
/*
* If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index b24795e2ee6d..2f3895ddc275 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m,
ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
if (ap == NULL) {
- nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+ nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
skb->len - sizeof(_arph));
return;
}
@@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
struct nf_log_buf *m;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 856648966f4c..c83a9963269b 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
struct nf_log_buf *m;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f8aad03d674b..6f5e8d01b876 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -255,11 +255,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
/* maniptype == SRC for postrouting. */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
- /* We never see fragments: conntrack defrags on pre-routing
- * and local-out, and nf_nat_out protects post-routing.
- */
- NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
-
ct = nf_ct_get(skb, &ctinfo);
/* Can't track? It's not due to stress, or conntrack would
* have dropped it. Hence it's the user's responsibility to
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index c9b52c361da2..53e49f5011d3 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1260,16 +1260,6 @@ static const struct nf_conntrack_expect_policy snmp_exp_policy = {
.timeout = 180,
};
-static struct nf_conntrack_helper snmp_helper __read_mostly = {
- .me = THIS_MODULE,
- .help = help,
- .expect_policy = &snmp_exp_policy,
- .name = "snmp",
- .tuple.src.l3num = AF_INET,
- .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
- .tuple.dst.protonum = IPPROTO_UDP,
-};
-
static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
.me = THIS_MODULE,
.help = help,
@@ -1288,22 +1278,16 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
static int __init nf_nat_snmp_basic_init(void)
{
- int ret = 0;
-
BUG_ON(nf_nat_snmp_hook != NULL);
RCU_INIT_POINTER(nf_nat_snmp_hook, help);
- ret = nf_conntrack_helper_register(&snmp_trap_helper);
- if (ret < 0) {
- nf_conntrack_helper_unregister(&snmp_helper);
- return ret;
- }
- return ret;
+ return nf_conntrack_helper_register(&snmp_trap_helper);
}
static void __exit nf_nat_snmp_basic_fini(void)
{
RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+ synchronize_rcu();
nf_conntrack_helper_unregister(&snmp_trap_helper);
}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index fd8220213afc..146d86105183 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -126,6 +126,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
/* ip_route_me_harder expects skb->dst to be set */
skb_dst_set_noref(nskb, skb_dst(oldskb));
+ nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
+
skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
ip4_dst_hoplimit(skb_dst(nskb)));
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
new file mode 100644
index 000000000000..a83d558e1aae
--- /dev/null
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
+ __be32 *raddr, __be32 *laddr,
+ __be16 *rport, __be16 *lport)
+{
+ unsigned int outside_hdrlen = ip_hdrlen(skb);
+ struct iphdr *inside_iph, _inside_iph;
+ struct icmphdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_REDIRECT:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return 1;
+ }
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr),
+ sizeof(_inside_iph), &_inside_iph);
+ if (inside_iph == NULL)
+ return 1;
+
+ if (inside_iph->protocol != IPPROTO_TCP &&
+ inside_iph->protocol != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr) +
+ (inside_iph->ihl << 2),
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_iph->protocol;
+ *laddr = inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+ const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet_lookup(net, &tcp_hashinfo, skb, doff,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+ return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be32 uninitialized_var(daddr), uninitialized_var(saddr);
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ u8 uninitialized_var(protocol);
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn const *ct;
+#endif
+ int doff = 0;
+
+ if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
+ struct udphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ protocol = iph->protocol;
+ saddr = iph->saddr;
+ sport = hp->source;
+ daddr = iph->daddr;
+ dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = iph->protocol == IPPROTO_TCP ?
+ ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+ ip_hdrlen(skb) + sizeof(*hp);
+
+ } else if (iph->protocol == IPPROTO_ICMP) {
+ if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
+ &sport, &dport))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) &&
+ ((iph->protocol != IPPROTO_ICMP &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (iph->protocol == IPPROTO_ICMP &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+
+ daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+ dport = (iph->protocol == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
+ return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+ daddr, sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure");
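
nf_sk_lookup_slow_v4() may return the skb's own socket or one it took a reference on, so callers must drop only references they acquired themselves. A hedged usage sketch of the kind of match this infrastructure backs (the policy check is an illustrative assumption):

        static bool example_socket_match(struct net *net,
                                         const struct sk_buff *skb,
                                         const struct net_device *indev)
        {
                struct sock *sk = skb->sk;
                bool matched;

                if (!sk)
                        sk = nf_sk_lookup_slow_v4(net, skb, indev);
                if (!sk)
                        return false;

                matched = sk_fullsock(sk);      /* example policy */

                if (sk != skb->sk)
                        sock_gen_put(sk);       /* drop only our own ref */
                return matched;
        }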
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index 0c01a270bf9f..0af3d8df70dd 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -30,7 +30,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr,
};
int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
- nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif);
+ nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif);
}
static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
new file mode 100644
index 000000000000..2981291910dd
--- /dev/null
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -0,0 +1,236 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+#define DSCP_BITS 0xfc
+
+void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dst = &regs->data[priv->dreg];
+ const struct net_device *dev = NULL;
+ const struct iphdr *iph;
+ __be32 addr;
+
+ if (priv->flags & NFTA_FIB_F_IIF)
+ dev = nft_in(pkt);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ dev = nft_out(pkt);
+
+ iph = ip_hdr(pkt->skb);
+ if (priv->flags & NFTA_FIB_F_DADDR)
+ addr = iph->daddr;
+ else
+ addr = iph->saddr;
+
+ *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
+
+static int get_ifindex(const struct net_device *dev)
+{
+ return dev ? dev->ifindex : 0;
+}
+
+void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct iphdr *iph;
+ struct fib_result res;
+ struct flowi4 fl4 = {
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+ const struct net_device *oif;
+ struct net_device *found;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int i;
+#endif
+
+ /*
+ * Do not set flowi4_oif, it restricts results (for example, asking
+ * for oif 3 will get an RTN_UNICAST result even if the daddr exists
+ * on another interface).
+ *
+ * Search the results for the desired output interface instead.
+ */
+ if (priv->flags & NFTA_FIB_F_OIF)
+ oif = nft_out(pkt);
+ else if (priv->flags & NFTA_FIB_F_IIF)
+ oif = nft_in(pkt);
+ else
+ oif = NULL;
+
+ if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+ nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ nft_in(pkt)->ifindex);
+ return;
+ }
+
+ iph = ip_hdr(pkt->skb);
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr)) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ get_ifindex(pkt->skb->dev));
+ return;
+ }
+ }
+
+ if (priv->flags & NFTA_FIB_F_MARK)
+ fl4.flowi4_mark = pkt->skb->mark;
+
+ fl4.flowi4_tos = iph->tos & DSCP_BITS;
+
+ if (priv->flags & NFTA_FIB_F_DADDR) {
+ fl4.daddr = iph->daddr;
+ fl4.saddr = get_saddr(iph->saddr);
+ } else {
+ fl4.daddr = iph->saddr;
+ fl4.saddr = get_saddr(iph->daddr);
+ }
+
+ *dest = 0;
+
+ if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
+ return;
+
+ switch (res.type) {
+ case RTN_UNICAST:
+ break;
+ case RTN_LOCAL: /* Should not see RTN_LOCAL here */
+ return;
+ default:
+ break;
+ }
+
+ if (!oif) {
+ found = FIB_RES_DEV(res);
+ goto ok;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (i = 0; i < res.fi->fib_nhs; i++) {
+ struct fib_nh *nh = &res.fi->fib_nh[i];
+
+ if (nh->nh_dev == oif) {
+ found = nh->nh_dev;
+ goto ok;
+ }
+ }
+ return;
+#else
+ found = FIB_RES_DEV(res);
+ if (found != oif)
+ return;
+#endif
+ok:
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ *dest = found->ifindex;
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ strncpy((char *)dest, found->name, IFNAMSIZ);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval);
+
+static struct nft_expr_type nft_fib4_type;
+
+static const struct nft_expr_ops nft_fib4_type_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval_type,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops nft_fib4_ops = {
+ .type = &nft_fib4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+ .eval = nft_fib4_eval,
+ .init = nft_fib_init,
+ .dump = nft_fib_dump,
+ .validate = nft_fib_validate,
+};
+
+static const struct nft_expr_ops *
+nft_fib4_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_fib_result result;
+
+ if (!tb[NFTA_FIB_RESULT])
+ return ERR_PTR(-EINVAL);
+
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+
+ switch (result) {
+ case NFT_FIB_RESULT_OIF:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_OIFNAME:
+ return &nft_fib4_ops;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ return &nft_fib4_type_ops;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static struct nft_expr_type nft_fib4_type __read_mostly = {
+ .name = "fib",
+ .select_ops = &nft_fib4_select_ops,
+ .policy = nft_fib_policy,
+ .maxattr = NFTA_FIB_MAX,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fib4_module_init(void)
+{
+ return nft_register_expr(&nft_fib4_type);
+}
+
+static void __exit nft_fib4_module_exit(void)
+{
+ nft_unregister_expr(&nft_fib4_type);
+}
+
+module_init(nft_fib4_module_init);
+module_exit(nft_fib4_module_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(2, "fib");
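
The expression exists mainly so nftables can do reverse-path filtering: when NFTA_FIB_F_DADDR is unset, nft_fib4_eval() swaps source and destination before the lookup, answering "through which interface would this packet's source be reached?". With a contemporary nft(8) the expected rule shape is something like the following (userspace syntax assumed, not part of this patch):

        nft add rule ip filter prerouting fib saddr . iif oif missing drop

i.e. drop packets whose source address is not reachable via the interface they arrived on.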
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 51ced81b616c..f18677277119 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -26,13 +26,19 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
if (priv->sreg_proto_min) {
- range.min_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_min];
- range.max_proto.all =
- *(__be16 *)&regs->data[priv->sreg_proto_max];
+ range.min_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_max]);
}
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
- &range, pkt->out);
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
+ &range, nft_out(pkt));
+}
+
+static void
+nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
}
static struct nft_expr_type nft_masq_ipv4_type;
@@ -41,6 +47,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval = nft_masq_ipv4_eval,
.init = nft_masq_init,
+ .destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
};
@@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init);
module_exit(nft_masq_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
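
The (__force __be16)nft_reg_load16() conversions above replace ad-hoc pointer punning on the register array: nft registers are 32-bit slots, and the typed accessors make loads and stores agree on which bytes of a slot hold a sub-word value, which the old casts did not guarantee across endiannesses. A sketch of roughly what the helpers amount to:

        /* sub-word register access; values live in the low-addressed bytes */
        static inline void reg_store16_sketch(u32 *dreg, u16 val)
        {
                *dreg = 0;
                *(u16 *)dreg = val;
        }

        static inline u16 reg_load16_sketch(const u32 *sreg)
        {
                return *(const u16 *)sreg;
        }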
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index c09d4381427e..5120be1d3118 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -26,17 +26,22 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
memset(&mr, 0, sizeof(mr));
if (priv->sreg_proto_min) {
- mr.range[0].min.all =
- *(__be16 *)&regs->data[priv->sreg_proto_min];
- mr.range[0].max.all =
- *(__be16 *)&regs->data[priv->sreg_proto_max];
+ mr.range[0].min.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_min]);
+ mr.range[0].max.all = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg_proto_max]);
mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
mr.range[0].flags |= priv->flags;
- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
- pkt->hook);
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
+}
+
+static void
+nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
}
static struct nft_expr_type nft_redir_ipv4_type;
@@ -45,6 +50,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval = nft_redir_ipv4_eval,
.init = nft_redir_init,
+ .destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
};
@@ -72,5 +78,5 @@ module_init(nft_redir_ipv4_module_init);
module_exit(nft_redir_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index 2c2553b9026c..517ce93699de 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,10 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook);
+ nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->net, pkt->skb, pkt->hook);
+ nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
break;
default:
break;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 96b8e2b95731..ccfbce13a633 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -156,17 +156,18 @@ int ping_hash(struct sock *sk)
void ping_unhash(struct sock *sk)
{
struct inet_sock *isk = inet_sk(sk);
+
pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+ write_lock_bh(&ping_table.lock);
if (sk_hashed(sk)) {
- write_lock_bh(&ping_table.lock);
hlist_nulls_del(&sk->sk_nulls_node);
sk_nulls_node_init(&sk->sk_nulls_node);
sock_put(sk);
isk->inet_num = 0;
isk->inet_sport = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- write_unlock_bh(&ping_table.lock);
}
+ write_unlock_bh(&ping_table.lock);
}
EXPORT_SYMBOL_GPL(ping_unhash);
@@ -433,9 +434,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
}
- pr_debug("after bind(): num = %d, dif = %d\n",
- (int)isk->inet_num,
- (int)sk->sk_bound_dev_if);
+ pr_debug("after bind(): num = %hu, dif = %d\n",
+ isk->inet_num,
+ sk->sk_bound_dev_if);
err = 0;
if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
@@ -609,15 +610,15 @@ int ping_getfrag(void *from, char *to,
fraglen -= sizeof(struct icmphdr);
if (fraglen < 0)
BUG();
- if (csum_and_copy_from_iter(to + sizeof(struct icmphdr),
+ if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr),
fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter) != fraglen)
+ &pfh->msg->msg_iter))
return -EFAULT;
} else if (offset < sizeof(struct icmphdr)) {
BUG();
} else {
- if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter) != fraglen)
+ if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter))
return -EFAULT;
}
@@ -642,6 +643,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
{
struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+ if (!skb)
+ return 0;
pfh->wcheck = csum_partial((char *)&pfh->icmph,
sizeof(struct icmphdr), pfh->wcheck);
pfh->icmph.checksum = csum_fold(pfh->wcheck);
@@ -793,7 +796,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
@@ -847,7 +851,8 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
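
The ping_unhash() change at the top of this file is a lock-widening fix: previously two threads could both pass the sk_hashed() test before either took ping_table.lock, and the loser would unhash an already-unhashed socket. Moving the check under the lock is the general cure for such check-then-act races; a sketch of the pattern with assumed names:

        static void example_unhash(struct sock *sk, rwlock_t *lock)
        {
                write_lock_bh(lock);            /* take the lock before the check */
                if (sk_hashed(sk))
                        __example_unhash_once(sk);      /* assumed helper */
                write_unlock_bh(lock);
        }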
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7143ca1a6af9..69cf49e8356d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -57,15 +57,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
unsigned int frag_mem;
int orphans, sockets;
- local_bh_disable();
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
- local_bh_enable();
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- atomic_read(&tcp_death_row.tw_count), sockets,
+ atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
@@ -264,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+ SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ecbe5a7c2d6d..9d943974de2b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -41,7 +41,7 @@
#include <linux/atomic.h>
#include <asm/byteorder.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/stddef.h>
#include <linux/slab.h>
@@ -89,9 +89,10 @@ struct raw_frag_vec {
int hlen;
};
-static struct raw_hashinfo raw_v4_hashinfo = {
+struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
};
+EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
int raw_hash_sk(struct sock *sk)
{
@@ -120,7 +121,7 @@ void raw_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(raw_unhash_sk);
-static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
sk_for_each_from(sk) {
@@ -136,6 +137,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
found:
return sk;
}
+EXPORT_SYMBOL_GPL(__raw_v4_lookup);
/*
* 0 - deliver
@@ -381,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
skb->transport_header = skb->network_header;
err = -EFAULT;
if (memcpy_from_msg(iph, msg, length))
@@ -604,7 +609,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0);
+ daddr, saddr, 0, 0, sk->sk_uid);
if (!inet->hdrincl) {
rfv.msg = msg;
@@ -664,7 +669,8 @@ out:
return len;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -676,7 +682,9 @@ static void raw_close(struct sock *sk, long timeout)
/*
* Raw sockets may have direct kernel references. Kill them.
*/
+ rtnl_lock();
ip_ra_control(sk, 0, NULL);
+ rtnl_unlock();
sk_common_release(sk);
}
@@ -693,12 +701,20 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+
+ if (sk->sk_bound_dev_if)
+ tb_id = l3mdev_fib_table_by_index(sock_net(sk),
+ sk->sk_bound_dev_if) ? : tb_id;
+
+ chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
+ tb_id);
+
ret = -EADDRNOTAVAIL;
if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
@@ -912,6 +928,20 @@ static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg
}
#endif
+int raw_abort(struct sock *sk, int err)
+{
+ lock_sock(sk);
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ __udp_disconnect(sk, 0);
+
+ release_sock(sk);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(raw_abort);
+
struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
@@ -937,6 +967,7 @@ struct proto raw_prot = {
.compat_getsockopt = compat_raw_getsockopt,
.compat_ioctl = compat_raw_ioctl,
#endif
+ .diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
new file mode 100644
index 000000000000..e1a51ca68d23
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,266 @@
+#include <linux/module.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+#include <net/inet_sock.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+
+#ifdef pr_fmt
+# undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static struct raw_hashinfo *
+raw_get_hashinfo(const struct inet_diag_req_v2 *r)
+{
+ if (r->sdiag_family == AF_INET) {
+ return &raw_v4_hashinfo;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (r->sdiag_family == AF_INET6) {
+ return &raw_v6_hashinfo;
+#endif
+ } else {
+ pr_warn_once("Unexpected inet family %d\n",
+ r->sdiag_family);
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+ }
+}
+
+/*
+ * Because we must not break the user API, we cannot simply
+ * rename the @pad field in struct inet_diag_req_v2; instead,
+ * use a helper structure to interpret it.
+ */
+
+static struct sock *raw_lookup(struct net *net, struct sock *from,
+ const struct inet_diag_req_v2 *req)
+{
+ struct inet_diag_req_raw *r = (void *)req;
+ struct sock *sk = NULL;
+
+ if (r->sdiag_family == AF_INET)
+ sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
+ r->id.idiag_dst[0],
+ r->id.idiag_src[0],
+ r->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
+ (const struct in6_addr *)r->id.idiag_src,
+ (const struct in6_addr *)r->id.idiag_dst,
+ r->id.idiag_if);
+#endif
+ return sk;
+}
+
+static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
+{
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct sock *sk = NULL, *s;
+ int slot;
+
+ if (IS_ERR(hashinfo))
+ return ERR_CAST(hashinfo);
+
+ read_lock(&hashinfo->lock);
+ for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
+ sk_for_each(s, &hashinfo->ht[slot]) {
+ sk = raw_lookup(net, s, r);
+ if (sk) {
+				/*
+				 * Grab it and hold it until the diag
+				 * message has been filled in, so the
+				 * caller must call sock_put() afterwards.
+				 * This is safe because we hold
+				 * hashinfo->lock here.
+				 */
+ sock_hold(sk);
+ goto out_unlock;
+ }
+ }
+ }
+out_unlock:
+ read_unlock(&hashinfo->lock);
+
+ return sk ? sk : ERR_PTR(-ENOENT);
+}
+
+static int raw_diag_dump_one(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *r)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
+ if (!rep) {
+ sock_put(sk);
+ return -ENOMEM;
+ }
+
+ err = inet_sk_diag_fill(sk, NULL, rep, r,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ sock_put(sk);
+
+ if (err < 0) {
+ kfree_skb(rep);
+ return err;
+ }
+
+ err = netlink_unicast(net->diag_nlsk, rep,
+ NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+ return err;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc, bool net_admin)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh, net_admin);
+}
+
+static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+ struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
+ struct sock *sk = NULL;
+
+ if (IS_ERR(hashinfo))
+ return;
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ read_lock(&hashinfo->lock);
+ for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
+ num = 0;
+
+ sk_for_each(sk, &hashinfo->ht[slot]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+ if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+ goto out_unlock;
+next:
+ num++;
+ }
+ }
+
+out_unlock:
+ read_unlock(&hashinfo->lock);
+
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int raw_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *r)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ sk = raw_sock_get(net, r);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+ err = sock_diag_destroy(sk, ECONNABORTED);
+ sock_put(sk);
+ return err;
+}
+#endif
+
+static const struct inet_diag_handler raw_diag_handler = {
+ .dump = raw_diag_dump,
+ .dump_one = raw_diag_dump_one,
+ .idiag_get_info = raw_diag_get_info,
+ .idiag_type = IPPROTO_RAW,
+ .idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = raw_diag_destroy,
+#endif
+};
+
+static void __always_unused __check_inet_diag_req_raw(void)
+{
+ /*
+ * Make sure the two structures are identical,
+ * except the @pad field.
+ */
+#define __offset_mismatch(m1, m2) \
+ (offsetof(struct inet_diag_req_v2, m1) != \
+ offsetof(struct inet_diag_req_raw, m2))
+
+ BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
+ sizeof(struct inet_diag_req_raw));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
+ BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
+ BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
+ BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
+ BUILD_BUG_ON(__offset_mismatch(id, id));
+#undef __offset_mismatch
+}
+
+static int __init raw_diag_init(void)
+{
+ return inet_diag_register(&raw_diag_handler);
+}
+
+static void __exit raw_diag_exit(void)
+{
+ inet_diag_unregister(&raw_diag_handler);
+}
+
+module_init(raw_diag_init);
+module_exit(raw_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
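
The new raw_diag module plugs raw sockets into the sock_diag framework, so socket-inspection tools can enumerate them and, with CONFIG_INET_DIAG_DESTROY, kill them through the .diag_destroy/raw_abort hook added in raw.c above. A minimal userspace sketch of a dump request this handler answers (error handling and reply parsing elided):

#include <linux/inet_diag.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg;

	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.sdiag_family = AF_INET;
	msg.req.sdiag_protocol = IPPROTO_RAW;	/* routes to raw_diag_handler */
	msg.req.idiag_states = ~0U;		/* all states */
	/* for dump_one lookups, @pad doubles as sdiag_raw_protocol */

	sendto(fd, &msg, sizeof(msg), 0,
	       (struct sockaddr *)&nladdr, sizeof(nladdr));
	/* ... recv() and walk the NLMSG_DONE-terminated replies here ... */
	close(fd);
	return 0;
}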
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2a57566e6e91..acd69cfe2951 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -65,7 +65,7 @@
#define pr_fmt(fmt) "IPv4: " fmt
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
.redirect = ip_do_redirect,
.local_out = __ip_local_out,
.neigh_lookup = ipv4_neigh_lookup,
+ .confirm_neigh = ipv4_confirm_neigh,
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+ struct net_device *dev = dst->dev;
+ const __be32 *pkey = daddr;
+ const struct rtable *rt;
+
+ rt = (const struct rtable *)dst;
+ if (rt->rt_gateway)
+ pkey = (const __be32 *)&rt->rt_gateway;
+ else if (!daddr ||
+ (rt->rt_flags &
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+ return;
+
+ __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
+}
+
#define IP_IDENTS_SZ 2048u
static atomic_t *ip_idents __read_mostly;
@@ -507,7 +526,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+ const struct sock *sk,
const struct iphdr *iph,
int oif, u8 tos,
u8 prot, u32 mark, int flow_flags)
@@ -523,19 +543,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
flowi4_init_output(fl4, oif, mark, tos,
RT_SCOPE_UNIVERSE, prot,
flow_flags,
- iph->daddr, iph->saddr, 0, 0);
+ iph->daddr, iph->saddr, 0, 0,
+ sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct sock *sk)
{
+ const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
- __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -552,7 +574,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0);
+ daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
rcu_read_unlock();
}
@@ -795,6 +817,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
struct rtable *rt;
struct flowi4 fl4;
const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
@@ -802,7 +825,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
rt = (struct rtable *) dst;
- __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -1020,7 +1043,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
if (!mark)
mark = IP4_REPLY_MARK(net, skb->mark);
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1036,7 +1059,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
if (!fl4.flowi4_mark)
fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
@@ -1055,6 +1078,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
struct dst_entry *odst = NULL;
bool new = false;
+ struct net *net = sock_net(sk);
bh_lock_sock(sk);
@@ -1068,7 +1092,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
goto out;
}
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
rt = (struct rtable *)odst;
if (odst->obsolete && !odst->ops->check(odst, 0)) {
@@ -1108,7 +1132,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1123,9 +1147,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
+ struct net *net = sock_net(sk);
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = __ip_route_output_key(sock_net(sk), &fl4);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
@@ -1598,6 +1623,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
spin_unlock_bh(&fnhe_lock);
}
+static void set_lwt_redirect(struct rtable *rth)
+{
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
+ rth->dst.output = lwtunnel_output;
+ }
+
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1687,14 +1725,7 @@ rt_cache:
rth->dst.input = ip_forward;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
- if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_output = rth->dst.output;
- rth->dst.output = lwtunnel_output;
- }
- if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_input = rth->dst.input;
- rth->dst.input = lwtunnel_input;
- }
+ set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1746,7 +1777,6 @@ standard_hash:
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
- const struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
@@ -1846,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
err = fib_lookup(net, &fl4, &res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
@@ -1871,7 +1902,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type != RTN_UNICAST)
goto martian_destination;
- err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+ err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
out: return err;
brd_input:
@@ -1902,7 +1933,8 @@ local_input:
}
}
- rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
+ rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
+ flags | RTCF_LOCAL, res.type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
@@ -1921,8 +1953,18 @@ local_input:
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
+
if (do_cache) {
- if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ WARN_ON(rth->dst.input == lwtunnel_input);
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+
+ if (unlikely(!rt_cache_route(nh, rth))) {
rth->dst.flags |= DST_NOCACHE;
rt_add_uncached_list(rth);
}
@@ -1967,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
int res;
+ tos &= IPTOS_RT_MASK;
rcu_read_lock();
/* Multicast recognition logic is moved from route cache to here.
@@ -1982,25 +2025,35 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ int our = 0;
- if (in_dev) {
- int our = ip_check_mc_rcu(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
- if (our
+ if (in_dev)
+ our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+
+ /* check l3 master if no match yet */
+ if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
+ struct in_device *l3_in_dev;
+
+ l3_in_dev = __in_dev_get_rcu(skb->dev);
+ if (l3_in_dev)
+ our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+ }
+
+ res = -EINVAL;
+ if (our
#ifdef CONFIG_IP_MROUTE
- ||
- (!ipv4_is_local_multicast(daddr) &&
- IN_DEV_MFORWARD(in_dev))
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
#endif
- ) {
- int res = ip_route_input_mc(skb, daddr, saddr,
- tos, dev, our);
- rcu_read_unlock();
- return res;
- }
+ ) {
+ res = ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
}
rcu_read_unlock();
- return -EINVAL;
+ return res;
}
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
rcu_read_unlock();
@@ -2140,8 +2193,7 @@ add:
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
- if (lwtunnel_output_redirect(rth->dst.lwtstate))
- rth->dst.output = lwtunnel_output;
+ set_lwt_redirect(rth);
return rth;
}
@@ -2268,7 +2320,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
res.fi = NULL;
res.table = NULL;
if (fl4->flowi4_oif &&
- !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
+ (ipv4_is_multicast(fl4->daddr) ||
+ !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
	/* Apparently, routing tables are wrong. Assume that
	   the destination is on link.
@@ -2421,7 +2474,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq, int event, int nowait, unsigned int flags)
+ u32 seq, int event)
{
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
@@ -2430,7 +2483,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
if (!nlh)
return -EMSGSIZE;
@@ -2439,7 +2492,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
r->rtm_tos = fl4->flowi4_tos;
- r->rtm_table = table_id;
+ r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
r->rtm_type = rt->rt_type;
@@ -2495,6 +2548,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
goto nla_put_failure;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
+ goto nla_put_failure;
+
error = rt->dst.error;
if (rt_is_input_route(rt)) {
@@ -2503,18 +2561,12 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
int err = ipmr_get_route(net, skb,
fl4->saddr, fl4->daddr,
- r, nowait, portid);
+ r, portid);
if (err <= 0) {
- if (!nowait) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- } else {
- if (err == -EMSGSIZE)
- goto nla_put_failure;
- error = err;
- }
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
}
} else
#endif
@@ -2547,6 +2599,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int mark;
struct sk_buff *skb;
u32 table_id = RT_TABLE_MAIN;
+ kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2567,13 +2620,17 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
skb_reset_network_header(skb);
/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
- ip_hdr(skb)->protocol = IPPROTO_ICMP;
+ ip_hdr(skb)->protocol = IPPROTO_UDP;
skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ if (tb[RTA_UID])
+ uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+ else
+ uid = (iif ? INVALID_UID : current_uid());
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2581,6 +2638,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_tos = rtm->rtm_tos;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = uid;
if (iif) {
struct net_device *dev;
@@ -2594,9 +2652,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
skb->mark = mark;
- local_bh_disable();
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
- local_bh_enable();
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
@@ -2621,7 +2677,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
- RTM_NEWROUTE, 0, 0);
+ RTM_NEWROUTE);
if (err < 0)
goto errout_free;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e3c4043c27de..496b97e17aaf 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <net/tcp.h>
#include <net/route.h>
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie_secret[2] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
-
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
- __u32 *tmp;
-
net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
-
- tmp = this_cpu_ptr(ipv4_cookie_scratch);
- memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
- tmp[0] = (__force u32)saddr;
- tmp[1] = (__force u32)daddr;
- tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
- tmp[3] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
- return tmp[17];
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ count, &syncookie_secret[c]);
}
@@ -334,6 +323,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq = tcp_rsk(req);
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
+ treq->ts_off = 0;
req->mss = mss;
ireq->ir_num = ntohs(th->dest);
ireq->ir_rmt_port = th->source;
@@ -372,7 +362,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest);
+ ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 80bc36b25de2..d6880a6149ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
+static int ip_privileged_port_min;
+static int ip_privileged_port_max = 65535;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
static int tcp_syn_retries_min = 1;
@@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0) {
- if (range[1] < range[0])
+ /* Ensure that the upper limit is not smaller than the lower,
+ * and that the lower does not encroach upon the privileged
+ * port limit.
+ */
+ if ((range[1] < range[0]) ||
+ (range[0] < net->ipv4.sysctl_ip_prot_sock))
ret = -EINVAL;
else
set_local_port_range(net, range);
@@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
return ret;
}
+/* Validate changes from /proc interface. */
+static int ipv4_privileged_ports(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_prot_sock);
+ int ret;
+ int pports;
+ int range[2];
+ struct ctl_table tmp = {
+ .data = &pports,
+ .maxlen = sizeof(pports),
+ .mode = table->mode,
+ .extra1 = &ip_privileged_port_min,
+ .extra2 = &ip_privileged_port_max,
+ };
+
+ pports = net->ipv4.sysctl_ip_prot_sock;
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ inet_get_local_port_range(net, &range[0], &range[1]);
+ /* Ensure that the local port range doesn't overlap with the
+ * privileged port range.
+ */
+ if (range[0] < pports)
+ ret = -EINVAL;
+ else
+ net->ipv4.sysctl_ip_prot_sock = pports;
+ }
+
+ return ret;
+}
static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
{
@@ -290,13 +331,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_max_tw_buckets",
- .data = &tcp_death_row.sysctl_max_tw_buckets,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_fastopen",
.data = &sysctl_tcp_fastopen,
.maxlen = sizeof(int),
@@ -310,13 +344,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_fastopen_key,
},
{
- .procname = "tcp_tw_recycle",
- .data = &tcp_death_row.sysctl_tw_recycle,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_abort_on_overflow",
.data = &sysctl_tcp_abort_on_overflow,
.maxlen = sizeof(int),
@@ -338,13 +365,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_max_syn_backlog",
- .data = &sysctl_max_syn_backlog,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
.maxlen = sizeof(int),
@@ -433,13 +453,6 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &tcp_adv_win_scale_max,
},
{
- .procname = "tcp_tw_reuse",
- .data = &sysctl_tcp_tw_reuse,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_frto",
.data = &sysctl_tcp_frto,
.maxlen = sizeof(int),
@@ -565,13 +578,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_thin_dupack",
- .data = &sysctl_tcp_thin_dupack,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_early_retrans",
.data = &sysctl_tcp_early_retrans,
.maxlen = sizeof(int),
@@ -958,7 +964,35 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_tcp_notsent_lowat,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec,
+ },
+ {
+ .procname = "tcp_tw_reuse",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_tw_buckets",
+ .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_tw_recycle",
+ .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_syn_backlog",
+ .data = &init_net.ipv4.sysctl_max_syn_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
},
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
@@ -971,6 +1005,24 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "ip_unprivileged_port_start",
+ .maxlen = sizeof(int),
+ .data = &init_net.ipv4.sysctl_ip_prot_sock,
+ .mode = 0644,
+ .proc_handler = ipv4_privileged_ports,
+ },
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "udp_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_udp_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
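
The ip_unprivileged_port_start knob registered above is validated by ipv4_privileged_ports(), which rejects values that would overlap ip_local_port_range (and ipv4_local_port_range() now rejects the converse). A sketch of lowering the privileged-port floor from userspace (requires root; 443 is an illustrative value):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_unprivileged_port_start", "w");

	if (!f)
		return 1;
	/* ports >= 443 no longer require CAP_NET_BIND_SERVICE */
	fprintf(f, "%d\n", 443);
	fclose(f);
	return 0;
}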
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 814af89c1bd3..40ba4249a586 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -277,9 +277,8 @@
#include <net/ip.h>
#include <net/sock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
-#include <asm/unaligned.h>
#include <net/busy_poll.h>
int sysctl_tcp_min_tso_segs __read_mostly = 2;
@@ -405,10 +404,8 @@ void tcp_init_sock(struct sock *sk)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
- u64_stats_init(&tp->syncp);
tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
- tcp_enable_early_retrans(tp);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
@@ -423,15 +420,13 @@ void tcp_init_sock(struct sock *sk)
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
- local_bh_disable();
sk_sockets_allocated_inc(sk);
- local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
{
- if (tsflags) {
+ if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -538,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
+ } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+		/* Active TCP fastopen socket with defer_connect.
+		 * Return POLLOUT so the application can call write()
+		 * in order for the kernel to generate the SYN+data.
+		 */
+ mask |= POLLOUT | POLLWRNORM;
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
@@ -665,9 +666,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
@@ -772,6 +773,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
ret = -EAGAIN;
break;
}
+			/* If __tcp_splice_read() got nothing while we have
+			 * an skb in the receive queue, we do not want to loop.
+			 * This might happen with URG data.
+			 */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
sk_wait_data(sk, &timeo, NULL);
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
@@ -960,10 +967,8 @@ new_segment:
copied += copy;
offset += copy;
size -= copy;
- if (!size) {
- tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+ if (!size)
goto out;
- }
if (skb->len < size_goal || (flags & MSG_OOB))
continue;
@@ -989,8 +994,11 @@ wait_for_memory:
}
out:
- if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
- tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ if (copied) {
+ tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+ if (!(flags & MSG_SENDPAGE_NOTLAST))
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ }
return copied;
do_error:
@@ -998,8 +1006,11 @@ do_error:
goto out;
out_err:
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
return sk_stream_error(sk, flags, err);
}
@@ -1072,6 +1083,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
int *copied, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
int err, flags;
if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1086,11 +1098,26 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ if (inet->defer_connect) {
+ err = tcp_connect(sk);
+ /* Same failure procedure as in tcp_v4/6_connect */
+ if (err) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ }
+ }
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
- msg->msg_namelen, flags);
- *copied = tp->fastopen_req->copied;
- tcp_free_fastopen_req(tp);
+ msg->msg_namelen, flags, 1);
+	/* fastopen_req may already have been freed in __inet_stream_connect
+	 * if the connection times out or is reset.
+	 */
+ if (tp->fastopen_req) {
+ *copied = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ inet->defer_connect = 0;
+ }
return err;
}
@@ -1108,7 +1135,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
lock_sock(sk);
flags = msg->msg_flags;
- if (flags & MSG_FASTOPEN) {
+ if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -1266,7 +1293,7 @@ new_segment:
} else {
skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, copy);
- get_page(pfrag->page);
+ page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
}
@@ -1280,7 +1307,6 @@ new_segment:
copied += copy;
if (!msg_data_left(msg)) {
- tcp_tx_timestamp(sk, sockc.tsflags, skb);
if (unlikely(flags & MSG_EOR))
TCP_SKB_CB(skb)->eor = 1;
goto out;
@@ -1311,8 +1337,10 @@ wait_for_memory:
}
out:
- if (copied)
+ if (copied) {
+ tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ }
out_nopush:
release_sock(sk);
return copied + copied_syn;
@@ -1333,8 +1361,11 @@ do_error:
out_err:
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
release_sock(sk);
return err;
}
@@ -2291,6 +2322,11 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
+ tcp_saved_syn_free(tp);
+
+ /* Clean up fastopen related fields */
+ tcp_free_fastopen_req(tp);
+ inet->defer_connect = 0;
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
@@ -2302,7 +2338,7 @@ EXPORT_SYMBOL(tcp_disconnect);
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
- ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+ (sk->sk_state != TCP_LISTEN);
}
static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
@@ -2469,11 +2505,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK:
if (val < 0 || val > 1)
err = -EINVAL;
- else {
- tp->thin_dupack = val;
- if (tp->thin_dupack)
- tcp_disable_early_retrans(tp);
- }
break;
case TCP_REPAIR:
@@ -2658,6 +2689,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
}
break;
+ case TCP_FASTOPEN_CONNECT:
+ if (val > 1 || val < 0) {
+ err = -EINVAL;
+ } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ if (sk->sk_state == TCP_CLOSE)
+ tp->fastopen_connect = val;
+ else
+ err = -EINVAL;
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ break;
case TCP_TIMESTAMP:
if (!tp->repair)
err = -EPERM;
@@ -2704,15 +2747,33 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
+static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
+ struct tcp_info *info)
+{
+ u64 stats[__TCP_CHRONO_MAX], total = 0;
+ enum tcp_chrono i;
+
+ for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
+ stats[i] = tp->chrono_stat[i - 1];
+ if (i == tp->chrono_type)
+ stats[i] += tcp_time_stamp - tp->chrono_start;
+ stats[i] *= USEC_PER_SEC / HZ;
+ total += stats[i];
+ }
+
+ info->tcpi_busy_time = total;
+ info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
+ info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
+}
+
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 now = tcp_time_stamp, intv;
- unsigned int start;
- int notsent_bytes;
+ u32 now, intv;
u64 rate64;
+ bool slow;
u32 rate;
memset(info, 0, sizeof(*info));
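
tcp_get_info_chrono_stats() above keeps the chrono buckets in jiffies, adds the still-running interval for the current chrono_type, and scales to microseconds with USEC_PER_SEC / HZ. A worked conversion under an assumed HZ value (the real value is a kernel config choice):

#include <stdint.h>

#define HZ 1000			/* assumed for illustration only */
#define USEC_PER_SEC 1000000L

/* e.g. 250 jiffies of SNDBUF_LIMITED time -> 250 * 1000 = 250000 us */
static uint64_t chrono_jiffies_to_usec(uint64_t jiffies)
{
	return jiffies * (USEC_PER_SEC / HZ);
}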
@@ -2721,6 +2782,30 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_state = sk_state_load(sk);
+ /* Report meaningful fields for all TCP states, including listeners */
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ info->tcpi_pacing_rate = rate64;
+
+ rate = READ_ONCE(sk->sk_max_pacing_rate);
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ info->tcpi_max_pacing_rate = rate64;
+
+ info->tcpi_reordering = tp->reordering;
+ info->tcpi_snd_cwnd = tp->snd_cwnd;
+
+ if (info->tcpi_state == TCP_LISTEN) {
+		/* listener-aliased fields:
+ * tcpi_unacked -> Number of children ready for accept()
+ * tcpi_sacked -> max backlog
+ */
+ info->tcpi_unacked = sk->sk_ack_backlog;
+ info->tcpi_sacked = sk->sk_max_ack_backlog;
+ return;
+ }
+
+ slow = lock_sock_fast(sk);
+
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
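
Because the listener fast path above returns early, only the pacing rates, reordering, cwnd and the two aliased backlog fields are meaningful for sockets in TCP_LISTEN. A sketch of reading them through the standard TCP_INFO getsockopt:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* On a listening socket, tcpi_unacked is the current accept-queue depth
 * and tcpi_sacked the maximum backlog, per the aliasing comment above.
 */
static int accept_queue_depth(int listen_fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(listen_fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
		return -1;
	return info.tcpi_unacked;
}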
@@ -2748,17 +2833,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- if (info->tcpi_state == TCP_LISTEN) {
- info->tcpi_unacked = sk->sk_ack_backlog;
- info->tcpi_sacked = sk->sk_max_ack_backlog;
- } else {
- info->tcpi_unacked = tp->packets_out;
- info->tcpi_sacked = tp->sacked_out;
- }
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
info->tcpi_fackets = tp->fackets_out;
+ now = tcp_time_stamp;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
@@ -2768,34 +2850,21 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rtt = tp->srtt_us >> 3;
info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
- info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss;
- info->tcpi_reordering = tp->reordering;
info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans;
- rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
- put_unaligned(rate64, &info->tcpi_pacing_rate);
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+ tcp_get_info_chrono_stats(tp, info);
- rate = READ_ONCE(sk->sk_max_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
- put_unaligned(rate64, &info->tcpi_max_pacing_rate);
-
- do {
- start = u64_stats_fetch_begin_irq(&tp->syncp);
- put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
- put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
- } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in;
- notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
- info->tcpi_notsent_bytes = max(0, notsent_bytes);
-
info->tcpi_min_rtt = tcp_min_rtt(tp);
info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
@@ -2806,11 +2875,36 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (rate && intv) {
rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
do_div(rate64, intv);
- put_unaligned(rate64, &info->tcpi_delivery_rate);
+ info->tcpi_delivery_rate = rate64;
}
+ unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *stats;
+ struct tcp_info info;
+
+ stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ tcp_get_info_chrono_stats(tp, &info);
+ nla_put_u64_64bit(stats, TCP_NLA_BUSY,
+ info.tcpi_busy_time, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
+ info.tcpi_rwnd_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
+ info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
+ tp->data_segs_out, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
+ tp->total_retrans, TCP_NLA_PAD);
+ return stats;
+}
+
static int do_tcp_getsockopt(struct sock *sk, int level,
int optname, char __user *optval, int __user *optlen)
{
@@ -2917,8 +3011,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
+
case TCP_THIN_DUPACK:
- val = tp->thin_dupack;
+ val = 0;
break;
case TCP_REPAIR:
@@ -2971,6 +3066,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
+ case TCP_FASTOPEN_CONNECT:
+ val = tp->fastopen_connect;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
@@ -3284,6 +3383,7 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+ inet_hashinfo_init(&tcp_hashinfo);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
@@ -3327,10 +3427,7 @@ void __init tcp_init(void)
cnt = tcp_hashinfo.ehash_mask + 1;
-
- tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
sysctl_tcp_max_orphans = cnt / 2;
- sysctl_max_syn_backlog = max(128, cnt / 256);
tcp_init_mem();
/* Set per-socket limits to no more than 1/128 the pressure threshold */
@@ -3349,6 +3446,7 @@ void __init tcp_init(void)
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+ tcp_v4_init();
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 0ea66c2c9344..b89bce4c721e 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -14,6 +14,36 @@
* observed, or adjust the sending rate if it estimates there is a
* traffic policer, in order to keep the drop rate reasonable.
*
+ * Here is a state transition diagram for BBR:
+ *
+ * |
+ * V
+ * +---> STARTUP ----+
+ * | | |
+ * | V |
+ * | DRAIN ----+
+ * | | |
+ * | V |
+ * +---> PROBE_BW ----+
+ * | ^ | |
+ * | | | |
+ * | +----+ |
+ * | |
+ * +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
* BBR is described in detail in:
* "BBR: Congestion-Based Congestion Control",
* Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
@@ -51,7 +81,7 @@ enum bbr_mode {
BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */
BBR_DRAIN, /* drain any queue created during startup */
BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */
- BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */
+ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
};
/* BBR congestion control block */
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 35b280361cb2..50a0f3e51d5b 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -27,6 +27,8 @@
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/module.h>
+#include <linux/sched/clock.h>
+
#include <net/tcp.h>
#define HYSTART_ACK_TRAIN 1
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index f9038d6b109e..79c4817abc94 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
int ret = 0;
- /* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
+ /* all algorithms must implement these */
+ if (!ca->ssthresh || !ca->undo_cwnd ||
+ !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
@@ -443,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+u32 tcp_reno_undo_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
+
struct tcp_congestion_ops tcp_reno = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
};
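
Since tcp_register_congestion_control() now refuses modules without undo_cwnd, every out-of-tree congestion control needs the hook as well; the newly exported tcp_reno_undo_cwnd() provides a ready default for Reno-style algorithms. A hypothetical minimal module built from the exported reno helpers (a sketch, not an in-tree example):

#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_example __read_mostly = {
	.name		= "example",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= tcp_reno_undo_cwnd,	/* now mandatory */
};

static int __init tcp_example_register(void)
{
	return tcp_register_congestion_control(&tcp_example);
}

static void __exit tcp_example_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_example);
}

module_init(tcp_example_register);
module_exit(tcp_example_unregister);
MODULE_LICENSE("GPL");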
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ab37c6775630..5f5e5936760e 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -335,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = {
static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.get_info = dctcp_get_info,
.owner = THIS_MODULE,
.name = "dctcp-reno",
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 4e777a3243f9..8ea4e9787f82 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -113,7 +113,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
struct tcp_fastopen_cookie tmp;
if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
- struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ struct in6_addr *buf = &tmp.addr;
int i;
for (i = 0; i < 4; i++)
@@ -205,6 +205,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
* scaled. So correct it appropriately.
*/
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+ tp->max_window = tp->snd_wnd;
/* Activate the retrans timer so that SYNACK can be retransmitted.
* The request socket is not added to the ehash
@@ -325,3 +326,57 @@ fastopen:
*foc = valid_foc;
return NULL;
}
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ unsigned long last_syn_loss = 0;
+ int syn_loss = 0;
+
+ tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+ /* Recurring FO SYN losses: no cookie or data in SYN */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ cookie->len = -1;
+ return false;
+ }
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+ cookie->len = -1;
+ return true;
+ }
+ return cookie->len > 0;
+}
+
+/* This function checks whether we want to defer sending the SYN until the
+ * first write().  We defer under the following conditions:
+ * 1. the fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: true if we want to defer until the application writes data;
+ *               false if we want to send out the SYN immediately.
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+ struct tcp_fastopen_cookie cookie = { .len = 0 };
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (tp->fastopen_connect && !tp->fastopen_req) {
+ if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+ inet_sk(sk)->defer_connect = 1;
+ return true;
+ }
+
+		/* Allocate fastopen_req so that the FO option can be
+		 * included in the SYN.
+		 */
+ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+ sk->sk_allocation);
+ if (tp->fastopen_req)
+ tp->fastopen_req->cookie = cookie;
+ else
+ *err = -ENOBUFS;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
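
tcp_fastopen_defer_connect() is the engine behind the TCP_FASTOPEN_CONNECT socket option wired up in tcp.c above: connect() returns immediately, and the SYN (carrying data when a valid cookie is cached) goes out on the first write(). A client-side sketch; the fallback define assumes the option value this series adds to the uapi header:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN_CONNECT
#define TCP_FASTOPEN_CONNECT 30	/* from this series' uapi addition */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(80),
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};

	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one));
	connect(fd, (struct sockaddr *)&dst, sizeof(dst));	/* no SYN yet */
	write(fd, "GET / HTTP/1.0\r\n\r\n", 18);		/* SYN + data */
	close(fd);
	return 0;
}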
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index db7842495a64..6d9879e93648 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -94,6 +94,7 @@ static const struct hstcp_aimd_val {
struct hstcp {
u32 ai;
+ u32 loss_cwnd;
};
static void hstcp_init(struct sock *sk)
@@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 hstcp_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- const struct hstcp *ca = inet_csk_ca(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
+static u32 hstcp_cwnd_undo(struct sock *sk)
+{
+ const struct hstcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
+ .undo_cwnd = hstcp_cwnd_undo,
.cong_avoid = hstcp_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 083831e359df..0f7175c3338e 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index c8e6d86be114..60352ff4f5a8 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -48,6 +48,7 @@ struct illinois {
u32 end_seq; /* right edge of current RTT */
u32 alpha; /* Additive increase */
	u32	beta;		/* Multiplicative decrease */
+ u32 loss_cwnd; /* cwnd on loss */
u16 acked; /* # packets acked by current ACK */
u8 rtt_above; /* average rtt has gone above threshold */
u8 rtt_low; /* # of rtts measurements below threshold */
@@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
+static u32 tcp_illinois_cwnd_undo(struct sock *sk)
+{
+ const struct illinois *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
/* Extract info for Tcp socket info provided via netlink. */
static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
@@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
+ .undo_cwnd = tcp_illinois_cwnd_undo,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c71d49ce0c93..659d1baefb2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,12 +79,13 @@
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
int sysctl_tcp_max_reordering __read_mostly = 300;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
/* rfc5961 challenge ack rate limiting */
int sysctl_tcp_challenge_ack_limit = 1000;
@@ -94,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -128,7 +126,8 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
-static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
+ unsigned int len)
{
static bool __once __read_mostly;
@@ -139,8 +138,9 @@ static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
- pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
- dev ? dev->name : "Unknown driver");
+ if (!dev || len >= dev->mtu)
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
rcu_read_unlock();
}
}
@@ -163,8 +163,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
if (len >= icsk->icsk_ack.rcv_mss) {
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
- if (unlikely(icsk->icsk_ack.rcv_mss != len))
- tcp_gro_dev_warn(sk, skb);
+ /* Account for possibly-removed options */
+ if (unlikely(len > icsk->icsk_ack.rcv_mss +
+ MAX_TCP_OPTION_SPACE))
+ tcp_gro_dev_warn(sk, skb, len);
} else {
 /* Otherwise, we make a more careful check, taking into account
 * that the SACK block is variable.
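
For scale: MAX_TCP_OPTION_SPACE is 40 bytes (the 60-byte maximum TCP header minus the 20-byte base header), so the relaxed check above stays quiet when a segment merely arrives with its options stripped, and only warns when the excess cannot be explained by option removal. A standalone sketch of the predicate:

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_TCP_OPTION_SPACE 40	/* 60-byte max header - 20-byte base */

	/* Model of the relaxed GRO warning predicate above: tolerate a
	 * receive that exceeds rcv_mss by at most the option space.
	 */
	static bool gro_suspect(unsigned int len, unsigned int rcv_mss)
	{
		return len > rcv_mss + MAX_TCP_OPTION_SPACE;
	}

	int main(void)
	{
		/* options removed: 1460 <= 1448 + 40, no warning */
		printf("%d\n", gro_suspect(1460, 1448));
		/* two coalesced segments: clearly suspect */
		printf("%d\n", gro_suspect(2896, 1448));
		return 0;
	}
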
@@ -876,22 +878,11 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
const int ts)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (metric > tp->reordering) {
- int mib_idx;
+ int mib_idx;
+ if (metric > tp->reordering) {
tp->reordering = min(sysctl_tcp_max_reordering, metric);
- /* This exciting event is worth to be remembered. 8) */
- if (ts)
- mib_idx = LINUX_MIB_TCPTSREORDER;
- else if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENOREORDER;
- else if (tcp_is_fack(tp))
- mib_idx = LINUX_MIB_TCPFACKREORDER;
- else
- mib_idx = LINUX_MIB_TCPSACKREORDER;
-
- NET_INC_STATS(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -903,9 +894,19 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
tcp_disable_fack(tp);
}
- if (metric > 0)
- tcp_disable_early_retrans(tp);
tp->rack.reord = 1;
+
+ /* This exciting event is worth remembering. 8) */
+ if (ts)
+ mib_idx = LINUX_MIB_TCPTSREORDER;
+ else if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENOREORDER;
+ else if (tcp_is_fack(tp))
+ mib_idx = LINUX_MIB_TCPFACKREORDER;
+ else
+ mib_idx = LINUX_MIB_TCPSACKREORDER;
+
+ NET_INC_STATS(sock_net(sk), mib_idx);
}
/* This must be called before lost_out is incremented */
@@ -915,10 +916,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
-
- if (!tp->lost_out ||
- after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
/* Sum the number of packets on the wire we have marked as lost.
@@ -1134,6 +1131,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate;
int flag;
};
@@ -1216,7 +1214,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
- tcp_rack_advance(tp, xmit_time, sacked);
+ tcp_rack_advance(tp, sacked, end_seq,
+ xmit_time, &state->ack_time);
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -1981,7 +1980,6 @@ void tcp_enter_loss(struct sock *sk)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
}
tcp_verify_left_out(tp);
@@ -2000,6 +1998,11 @@ void tcp_enter_loss(struct sock *sk)
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
* loss recovery is underway except recurring timeout(s) on
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ *
+ * In theory F-RTO can be used repeatedly during loss recovery.
+ * In practice this interacts badly with broken middle-boxes that
+ * falsely raise the receive window, which results in repeated
+ * timeouts and stop-and-go behavior.
*/
tp->frto = sysctl_tcp_frto &&
(new_recovery || icsk->icsk_retransmits) &&
@@ -2055,30 +2058,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- unsigned long delay;
-
- /* Delay early retransmit and entering fast recovery for
- * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
- * available, or RTO is scheduled to fire first.
- */
- if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
- (flag & FLAG_ECE) || !tp->srtt_us)
- return false;
-
- delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
- msecs_to_jiffies(2));
-
- if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
- return false;
-
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
- TCP_RTO_MAX);
- return true;
-}
-
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
@@ -2126,10 +2105,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
- * Essentially, we have now two algorithms counting
+ * Essentially, we now have a few algorithms for detecting
* lost packets.
*
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher-sequence packets
+ * SACKed is greater than or equal to the DUPACK threshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK (disabled by default; subsumed by RACK):
+ * It is the simplest heuristic. As soon as we decide
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2138,16 +2133,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
 * is lost (NewReno). This heuristic is the same in NewReno
* and SACK.
*
- * Imagine, that's all! Forget about all this shamanism about CWND inflation
- * deflation etc. CWND is real congestion window, never inflated, changes
- * only according to classic VJ rules.
- *
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
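
To make the counting-vs-timing distinction in the rewritten comment concrete, here is a minimal userspace sketch of the RACK criterion it describes: a packet is deemed lost once something sent after it has been (s)acked and more than RTT + reordering window has elapsed since its own transmission. All names and types are illustrative; this is not the kernel's tcp_rack_mark_lost.

	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative RACK-style loss test: p was sent before the newest
	 * delivered packet, and its reordering window has expired.
	 */
	struct pkt {
		unsigned long xmit_us; /* transmit timestamp */
		bool sacked;           /* already S/ACKed? */
	};

	static bool rack_lost(const struct pkt *p,
			      unsigned long newest_delivered_xmit_us,
			      unsigned long now_us,
			      unsigned long rtt_us, unsigned long reo_wnd_us)
	{
		if (p->sacked)
			return false;
		if (p->xmit_us >= newest_delivered_xmit_us)
			return false; /* nothing sent later was delivered */
		return now_us - p->xmit_us > rtt_us + reo_wnd_us;
	}

	int main(void)
	{
		struct pkt p = { .xmit_us = 1000, .sacked = false };

		/* sent at t=1000us, a later packet (t=2000us) was already
		 * delivered, and rtt+reo_wnd (30ms+2ms) has elapsed: lost.
		 */
		printf("%d\n", rack_lost(&p, 2000, 40000, 30000, 2000));
		return 0;
	}
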
@@ -2175,8 +2168,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- __u32 packets_out;
- int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2186,39 +2177,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)
return true;
- /* Trick#4: It is still not OK... But will it be useful to delay
- * recovery more?
- */
- packets_out = tp->packets_out;
- if (packets_out <= tp->reordering &&
- tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
- !tcp_may_send_now(sk)) {
- /* We have nothing to send. This connection is limited
- * either by receiver window or by application.
- */
- return true;
- }
-
- /* If a thin stream is detected, retransmit after first
- * received dupack. Employ only if SACK is supported in order
- * to avoid possible corner-case series of spurious retransmissions
- * Use only if there are no unsent data.
- */
- if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
- tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
- tcp_is_sack(tp) && !tcp_send_head(sk))
- return true;
-
- /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
- * retransmissions due to small network reorderings, we implement
- * Mitigation A.3 in the RFC and delay the retransmission for a short
- * interval if appropriate.
- */
- if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
- (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
- !tcp_may_send_now(sk))
- return !tcp_pause_early_retransmit(sk, flag);
-
return false;
}
@@ -2414,10 +2372,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->undo_cwnd)
- tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
- else
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2523,8 +2478,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
- int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2692,7 +2646,7 @@ void tcp_simple_retransmit(struct sock *sk)
}
EXPORT_SYMBOL(tcp_simple_retransmit);
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
@@ -2728,14 +2682,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
tcp_try_undo_loss(sk, false))
return;
- if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
- /* Step 3.b. A timeout is spurious if not all data are
- * lost, i.e., never-retransmitted data are (s)acked.
- */
- if ((flag & FLAG_ORIG_SACK_ACKED) &&
- tcp_try_undo_loss(sk, true))
- return;
+ /* The ACK (s)acks some never-retransmitted data, meaning not all
+ * the data packets before the timeout were lost. Therefore we
+ * undo the congestion window and state. This is essentially
+ * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+ * a retransmitted skb is permanently marked, we can apply such an
+ * operation even if F-RTO was not used.
+ */
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, tp->undo_marker))
+ return;
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
if (after(tp->snd_nxt, tp->high_seq)) {
if (flag & FLAG_DATA_SACKED || is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
@@ -2802,6 +2760,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
return false;
}
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+ const struct skb_mstamp *ack_time)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+ u32 prior_retrans = tp->retrans_out;
+
+ tcp_rack_mark_lost(sk, ack_time);
+ if (prior_retrans > tp->retrans_out)
+ *ack_flag |= FLAG_LOST_RETRANS;
+ }
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2815,7 +2788,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- bool is_dupack, int *ack_flag, int *rexmit)
+ bool is_dupack, int *ack_flag, int *rexmit,
+ const struct skb_mstamp *ack_time)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2866,13 +2840,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
- /* Use RACK to detect loss */
- if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
- tcp_rack_mark_lost(sk)) {
- flag |= FLAG_LOST_RETRANS;
- *ack_flag |= FLAG_LOST_RETRANS;
- }
-
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2890,11 +2857,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_keep_open(sk);
return;
}
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- if (icsk->icsk_ca_state != TCP_CA_Open &&
- !(flag & FLAG_LOST_RETRANS))
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+ (*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
default:
@@ -2908,6 +2877,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -3026,7 +2996,7 @@ void tcp_rearm_rto(struct sock *sk)
} else {
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
- if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp =
@@ -3043,24 +3013,6 @@ void tcp_rearm_rto(struct sock *sk)
}
}
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_rearm_rto(sk);
-
- /* Stop if ER is disabled after the delayed ER timer is scheduled */
- if (!tp->do_early_retrans)
- return;
-
- tcp_enter_recovery(sk, false);
- tcp_update_scoreboard(sk, 1);
- tcp_xmit_retransmit_queue(sk);
-}
-
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
@@ -3103,11 +3055,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack,
- struct skb_mstamp *now)
+ struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt;
+ struct skb_mstamp *now = &sack->ack_time;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3167,7 +3119,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
- tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ tcp_rack_advance(tp, sacked, scb->end_seq,
+ &skb->skb_mstamp,
+ &sack->ack_time);
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3201,6 +3155,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->lost_skb_hint = NULL;
}
+ if (!skb)
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
@@ -3371,9 +3328,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
u32 delta = ack - tp->snd_una;
sock_owned_by_me((struct sock *)tp);
- u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_acked += delta;
- u64_stats_update_end_raw(&tp->syncp);
tp->snd_una = ack;
}
@@ -3383,9 +3338,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
u32 delta = seq - tp->rcv_nxt;
sock_owned_by_me((struct sock *)tp);
- u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_received += delta;
- u64_stats_update_end_raw(&tp->syncp);
tp->rcv_nxt = seq;
}
@@ -3598,7 +3551,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
- struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
sack_state.rate = &rs;
@@ -3624,10 +3576,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- skb_mstamp_get(&now);
+ skb_mstamp_get(&sack_state.ack_time);
- if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
if (after(ack, prior_snd_una)) {
@@ -3692,34 +3643,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state, &now);
+ &sack_state);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
- struct dst_entry *dst = __sk_dst_get(sk);
- if (dst)
- dst_confirm(dst);
- }
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+ sk_dst_confirm(sk);
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
- tcp_rate_gen(sk, delivered, lost, &now, &rs);
- tcp_cong_control(sk, ack, delivered, flag, &rs);
+ tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
+ sack_state.rate);
+ tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3740,9 +3691,11 @@ old_ack:
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
+ skb_mstamp_get(&sack_state.ack_time);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4560,6 +4513,7 @@ add_sack:
end:
if (skb) {
tcp_grow_window(sk, skb);
+ skb_condense(skb);
skb_set_owner_r(skb, sk);
}
}
@@ -5081,10 +5035,13 @@ static void tcp_check_space(struct sock *sk)
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
/* pairs with tcp_poll() */
- smp_mb__after_atomic();
+ smp_mb();
if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
}
}
@@ -5249,6 +5206,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
return err;
}
+/* Accept RST for rcv_nxt - 1 after a FIN.
+ * When TCP connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST is sent with the same
+ * sequence number as the FIN, and thus according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition, middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
+ (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
+ TCPF_CLOSING));
+}
+
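
A standalone model of the predicate just added: sequence numbers are mod-2^32, so plain unsigned subtraction handles wraparound, and the state mask limits acceptance to states in which the peer's FIN has already been consumed (so rcv_nxt - 1 is exactly the FIN's sequence number):

	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned int u32;

	/* Illustrative states; only the first three have consumed a FIN. */
	enum st { CLOSE_WAIT, LAST_ACK, CLOSING, ESTABLISHED };

	/* Standalone model of tcp_reset_check: mod-2^32 arithmetic via
	 * plain u32 subtraction handles sequence wraparound.
	 */
	static bool reset_check(u32 seg_seq, u32 rcv_nxt, enum st s)
	{
		bool after_fin = s == CLOSE_WAIT || s == LAST_ACK ||
				 s == CLOSING;

		return seg_seq == rcv_nxt - 1 && after_fin;
	}

	int main(void)
	{
		/* RST reusing the FIN's sequence number: accepted */
		printf("%d\n", reset_check(999, 1000, LAST_ACK));
		/* same sequence but no FIN consumed: challenge-ACK path */
		printf("%d\n", reset_check(999, 1000, ESTABLISHED));
		return 0;
	}
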
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5287,20 +5261,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
+ } else if (tcp_reset_check(sk, skb)) {
+ tcp_reset(sk);
}
goto discard;
}
/* Step 2: check RST bit */
if (th->rst) {
- /* RFC 5961 3.2 (extend to match against SACK too if available):
- * If seq num matches RCV.NXT or the right-most SACK block,
+ /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+ * FIN and SACK too if available):
+ * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+ * the right-most SACK block,
* then
* RESET the connection
* else
* Send a challenge ACK
*/
- if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+ tcp_reset_check(sk, skb)) {
rst_seq_match = true;
} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
struct tcp_sack_block *sp = &tp->selective_acks[0];
@@ -5571,6 +5550,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_set_state(sk, TCP_ESTABLISHED);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -5789,7 +5769,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
- icsk->icsk_ack.lrcvtime = tcp_time_stamp;
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5916,9 +5895,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (th->syn) {
if (th->fin)
goto discard;
- if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
- return 1;
+ /* It is possible that we process SYN packets from backlog,
+ * so we need to make sure to disable BH right there.
+ */
+ local_bh_disable();
+ acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
+ local_bh_enable();
+ if (!acceptable)
+ return 1;
consume_skb(skb);
return 0;
}
@@ -6022,7 +6007,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
case TCP_FIN_WAIT1: {
- struct dst_entry *dst;
int tmo;
/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6033,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
- dst = __sk_dst_get(sk);
- if (dst)
- dst_confirm(dst);
+ sk_dst_confirm(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
/* Wake up lingering close() */
@@ -6318,13 +6300,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
-
- /* Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
@@ -6334,6 +6310,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
tcp_rsk(req)->af_specific = af_ops;
+ tcp_rsk(req)->ts_off = 0;
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6355,6 +6332,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
+ if (isn && tmp_opt.tstamp_ok)
+ af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+
if (!want_cookie && !isn) {
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
@@ -6365,7 +6345,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
- if (tcp_death_row.sysctl_tw_recycle) {
+ if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
bool strict;
dst = af_ops->route_req(sk, &fl, req, &strict);
@@ -6379,8 +6359,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
/* Kill the following clause, if you dislike this way. */
else if (!net->ipv4.sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (sysctl_max_syn_backlog >> 2)) &&
+ (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst, false,
tmp_opt.saw_tstamp)) {
/* Without syncookies last quarter of
@@ -6395,7 +6375,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_release;
}
- isn = af_ops->init_seq(skb);
+ isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
}
if (!dst) {
dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6407,6 +6387,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ tcp_rsk(req)->ts_off = 0;
req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2259114c7242..575e19dcc017 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,7 +84,6 @@
#include <crypto/hash.h>
#include <linux/scatterlist.h>
-int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
#ifdef CONFIG_TCP_MD5SIG
@@ -95,12 +94,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
+ tcp_hdr(skb)->source, tsoff);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -120,7 +119,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (!twp || (sysctl_tcp_tw_reuse &&
+ (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (tp->write_seq == 0)
@@ -146,7 +145,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct flowi4 *fl4;
struct rtable *rt;
int err;
+ u32 seq;
struct ip_options_rcu *inet_opt;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -197,7 +198,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = 0;
}
- if (tcp_death_row.sysctl_tw_recycle &&
+ if (tcp_death_row->sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
tcp_fetch_timewait_stamp(sk, &rt->dst);
@@ -216,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
- err = inet_hash_connect(&tcp_death_row, sk);
+ err = inet_hash_connect(tcp_death_row, sk);
if (err)
goto failure;
@@ -232,18 +233,27 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
+ rt = NULL;
- if (!tp->write_seq && likely(!tp->repair))
- tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- usin->sin_port);
+ if (likely(!tp->repair)) {
+ seq = secure_tcp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port,
+ &tp->tsoffset);
+ if (!tp->write_seq)
+ tp->write_seq = seq;
+ }
inet->inet_id = tp->write_seq ^ jiffies;
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto failure;
+
err = tcp_connect(sk);
- rt = NULL;
if (err)
goto failure;
@@ -269,10 +279,13 @@ EXPORT_SYMBOL(tcp_v4_connect);
*/
void tcp_v4_mtu_reduced(struct sock *sk)
{
- struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
- u32 mtu = tcp_sk(sk)->mtu_info;
+ struct dst_entry *dst;
+ u32 mtu;
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ return;
+ mtu = tcp_sk(sk)->mtu_info;
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -418,7 +431,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
switch (type) {
case ICMP_REDIRECT:
- do_redirect(icmp_skb, sk);
+ if (!sock_owned_by_user(sk))
+ do_redirect(icmp_skb, sk);
goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
@@ -442,7 +456,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
@@ -691,6 +705,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -711,7 +726,7 @@ out:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct net *net,
+static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
@@ -726,6 +741,7 @@ static void tcp_v4_send_ack(struct net *net,
#endif
];
} rep;
+ struct net *net = sock_net(sk);
struct ip_reply_arg arg;
memset(&rep.th, 0, sizeof(struct tcphdr));
@@ -775,6 +791,7 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -790,7 +807,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(sock_net(sk), skb,
+ tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
@@ -818,10 +835,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
- tcp_v4_send_ack(sock_net(sk), skb, seq,
+ tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp,
+ tcp_time_stamp + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -1315,10 +1332,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_ca_openreq_child(newsk, dst);
tcp_sync_mss(newsk, dst_mtu(dst));
- newtp->advmss = dst_metric_advmss(dst);
- if (tcp_sk(sk)->rx_opt.user_mss &&
- tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
- newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+ newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -1552,8 +1566,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
* It has been noticed pure SACK packets were sometimes dropped
* (if cooked by drivers without copybreak feature).
*/
- if (!skb->data_len)
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ skb_condense(skb);
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
@@ -1813,7 +1826,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
- .bind_conflict = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
@@ -1884,9 +1896,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_free_fastopen_req(tp);
tcp_saved_syn_free(tp);
- local_bh_disable();
sk_sockets_allocated_dec(sk);
- local_bh_enable();
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -1908,7 +1918,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (!sk) {
get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock_bh(&ilb->lock);
+ spin_lock(&ilb->lock);
sk = sk_head(&ilb->head);
st->offset = 0;
goto get_sk;
@@ -1925,7 +1935,7 @@ get_sk:
if (sk->sk_family == st->family)
return sk;
}
- spin_unlock_bh(&ilb->lock);
+ spin_unlock(&ilb->lock);
st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE)
goto get_head;
@@ -2133,7 +2143,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
+ spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
@@ -2225,7 +2235,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
@@ -2372,6 +2382,7 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
@@ -2415,7 +2426,7 @@ static void __net_exit tcp_sk_exit(struct net *net)
static int __net_init tcp_sk_init(struct net *net)
{
- int res, cpu;
+ int res, cpu, cnt;
net->ipv4.tcp_sk = alloc_percpu(struct sock *);
if (!net->ipv4.tcp_sk)
@@ -2452,6 +2463,14 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_orphan_retries = 0;
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+ net->ipv4.sysctl_tcp_tw_reuse = 0;
+
+ cnt = tcp_hashinfo.ehash_mask + 1;
+ net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+ net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+
+ net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
return 0;
fail:
@@ -2462,7 +2481,7 @@ fail:
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
- inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+ inet_twsk_purge(&tcp_hashinfo, AF_INET);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2473,7 +2492,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
void __init tcp_v4_init(void)
{
- inet_hashinfo_init(&tcp_hashinfo);
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c67ece1390c2..046fd3910873 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
static struct tcp_congestion_ops tcp_lp __read_mostly = {
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_lp_cong_avoid,
.pkts_acked = tcp_lp_pkts_acked,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index bf1f3b2b29d1..0f46e5fe31ad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
u32 val;
int m;
+ sk_dst_confirm(sk);
if (sysctl_tcp_nometrics_save || !dst)
return;
- if (dst->flags & DST_HOST)
- dst_confirm(dst);
-
rcu_read_lock();
if (icsk->icsk_backoff || !tp->srtt_us) {
/* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
+ sk_dst_confirm(sk);
if (!dst)
goto reset;
- dst_confirm(dst);
-
rcu_read_lock();
tm = tcp_get_metrics(sk, dst, true);
if (!tm) {
@@ -522,7 +519,6 @@ void tcp_init_metrics(struct sock *sk)
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val) {
tcp_disable_fack(tp);
- tcp_disable_early_retrans(tp);
tp->reordering = val;
}
@@ -606,7 +602,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
return ret;
}
-EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
@@ -742,14 +737,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
rcu_read_unlock();
}
-static struct genl_family tcp_metrics_nl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = TCP_METRICS_GENL_NAME,
- .version = TCP_METRICS_GENL_VERSION,
- .maxattr = TCP_METRICS_ATTR_MAX,
- .netnsok = true,
-};
+static struct genl_family tcp_metrics_nl_family;
static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
[TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
@@ -1116,6 +1104,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
},
};
+static struct genl_family tcp_metrics_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = TCP_METRICS_GENL_NAME,
+ .version = TCP_METRICS_GENL_VERSION,
+ .maxattr = TCP_METRICS_ATTR_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = tcp_metrics_nl_ops,
+ .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
+};
+
static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
@@ -1179,8 +1178,7 @@ void __init tcp_metrics_init(void)
if (ret < 0)
panic("Could not allocate the tcp_metrics hash table\n");
- ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
- tcp_metrics_nl_ops);
+ ret = genl_register_family(&tcp_metrics_nl_family);
if (ret < 0)
panic("Could not register tcp_metrics generic netlink\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6234ebaa7db1..65c0f3d13eca 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -29,12 +29,6 @@
int sysctl_tcp_abort_on_overflow __read_mostly;
-struct inet_timewait_death_row tcp_death_row = {
- .sysctl_max_tw_buckets = NR_FILE * 2,
- .hashinfo = &tcp_hashinfo,
-};
-EXPORT_SYMBOL_GPL(tcp_death_row);
-
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
@@ -100,13 +94,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock *)tw)->ipv4.tcp_death_row;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
- tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+ if (tmp_opt.rcv_tsecr)
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -153,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
- if (tcp_death_row.sysctl_tw_recycle &&
+ if (tcp_death_row->sysctl_tw_recycle &&
tcptw->tw_ts_recent_stamp &&
tcp_tw_remember_stamp(tw))
inet_twsk_reschedule(tw, tw->tw_timeout);
@@ -264,11 +260,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
bool recycle_ok = false;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
- if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
- tw = inet_twsk_alloc(sk, &tcp_death_row, state);
+ tw = inet_twsk_alloc(sk, tcp_death_row, state);
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -364,15 +361,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk_listener);
- u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
int full_space = tcp_full_space(sk_listener);
- int mss = dst_metric_advmss(dst);
u32 window_clamp;
__u8 rcv_wscale;
+ int mss;
- if (user_mss && user_mss < mss)
- mss = user_mss;
-
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
@@ -466,13 +460,13 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+ newicsk->icsk_ack.lrcvtime = tcp_time_stamp;
newtp->packets_out = 0;
newtp->retrans_out = 0;
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
newtp->lsndtime = treq->snt_synack.stamp_jiffies;
newsk->sk_txhash = treq->txhash;
@@ -532,7 +526,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
- newtp->tsoffset = 0;
+ newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -581,6 +575,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
+ if (tmp_opt.rcv_tsecr)
+ tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 896e9dfbdb5c..c3c082ed3879 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
- }
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
}
-/* SND.NXT, if window was not shrunk.
+/* SND.NXT, if window was not shrunk, or the amount shrunk is less than one
+ * window-scaling unit, due to loss of precision.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
@@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+ if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
+ (tp->rx_opt.wscale_ok &&
+ ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
return tp->snd_nxt;
else
return tcp_wnd_end(tp);
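
The tolerated shrink is one window-scaling unit: with rcv_wscale = 7 the peer advertises its window in 128-byte steps, so an apparent overshoot of up to 127 bytes can be pure rounding. A standalone model of the relaxed test (before() is modeled here by a signed 32-bit sequence comparison):

	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned int u32;

	/* Model of the relaxed check above: snd_nxt is acceptable if it
	 * is at or inside the window edge, or past it by less than one
	 * window-scaling unit (a precision artifact, not a real shrink).
	 */
	static bool nxt_acceptable(u32 snd_nxt, u32 wnd_end,
				   bool wscale_ok, u32 rcv_wscale)
	{
		if ((int)(wnd_end - snd_nxt) >= 0)  /* window not shrunk */
			return true;
		return wscale_ok && (snd_nxt - wnd_end) < (1u << rcv_wscale);
	}

	int main(void)
	{
		/* wscale 7 => 128-byte granularity */
		printf("%d\n", nxt_acceptable(10100, 10000, true, 7)); /* 1 */
		printf("%d\n", nxt_acceptable(10200, 10000, true, 7)); /* 0 */
		return 0;
	}
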
@@ -640,7 +641,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
}
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcp_skb_timestamp(skb);
+ opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -769,25 +770,27 @@ static void tcp_tasklet_func(unsigned long data)
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
- bh_lock_sock(sk);
-
- if (!sock_owned_by_user(sk)) {
- tcp_tsq_handler(sk);
- } else {
- /* defer the work to tcp_release_cb() */
- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ smp_mb__before_atomic();
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+ if (!sk->sk_lock.owned &&
+ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+ tcp_tsq_handler(sk);
+ }
+ bh_unlock_sock(sk);
}
- bh_unlock_sock(sk);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
- (1UL << TCP_WRITE_TIMER_DEFERRED) | \
- (1UL << TCP_DELACK_TIMER_DEFERRED) | \
- (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -797,18 +800,17 @@ static void tcp_tasklet_func(unsigned long data)
*/
void tcp_release_cb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = tp->tsq_flags;
+ flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
- if (flags & (1UL << TCP_TSQ_DEFERRED))
+ if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
/* Here begins the tricky part :
@@ -822,15 +824,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);
- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
@@ -860,6 +862,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
int wmem;
/* Keep one reference on sk_wmem_alloc.
@@ -877,16 +880,25 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
- unsigned long flags;
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
- tasklet_schedule(&tsq->tasklet);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}
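
The rewritten tcp_wfree folds the old test_and_clear_bit/test_and_set_bit pair into one cmpxchg loop, so clearing THROTTLED, setting QUEUED, and marking TSQ_DEFERRED happen as a single atomic transition. A standalone sketch of that pattern with C11 atomics (the flag values are illustrative, not the kernel's bit layout):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define F_THROTTLED (1UL << 0) /* illustrative flag values, */
	#define F_QUEUED    (1UL << 1) /* not the kernel's layout   */
	#define F_DEFERRED  (1UL << 2)

	/* Atomically move THROTTLED -> QUEUED|DEFERRED, retrying if the
	 * flags changed underneath us; bail out if not throttled or
	 * already queued.
	 */
	static bool try_queue(_Atomic unsigned long *flags)
	{
		unsigned long oval = atomic_load(flags);

		for (;;) {
			unsigned long nval;

			if (!(oval & F_THROTTLED) || (oval & F_QUEUED))
				return false;
			nval = (oval & ~F_THROTTLED) | F_QUEUED | F_DEFERRED;
			if (atomic_compare_exchange_weak(flags, &oval, nval))
				return true; /* we own the queueing step */
			/* oval was reloaded by the failed CAS; retry */
		}
	}

	int main(void)
	{
		_Atomic unsigned long flags = F_THROTTLED;

		printf("%d\n", try_queue(&flags)); /* 1: we queued it  */
		printf("%d\n", try_queue(&flags)); /* 0: already queued */
		return 0;
	}
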
@@ -955,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*/
skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+ /* If we had to use memory reserve to allocate this skb,
+ * this might cause drops if the packet is looped back:
+ * Other socket might not have SOCK_MEMALLOC.
+ * Packets not looped back do not care about pfmemalloc.
+ */
+ skb->pfmemalloc = 0;
+
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -964,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
@@ -1027,7 +1048,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Our usage of tstamp should remain private */
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1514,6 +1535,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (sysctl_tcp_slow_start_after_idle &&
(s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
tcp_cwnd_application_limited(sk);
+
+ /* The following conditions together indicate that starvation
+ * is caused by an insufficient sender buffer:
+ * 1) just sent some data (see tcp_write_xmit)
+ * 2) not cwnd limited (this else condition)
+ * 3) no more data to send (tcp_send_head is NULL)
+ * 4) application is hitting buffer limit (SOCK_NOSPACE)
+ */
+ if (!tcp_send_head(sk) && sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+ (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -1910,26 +1943,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
*/
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
- int len;
int probe_size;
int size_needed;
- int copy;
+ int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,
@@ -2069,7 +2102,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* Always send the 1st or 2nd skb in write queue.
+ * No need to wait for TX completion to call us back
+ * after softirq/tasklet schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (skb == sk->sk_write_queue.next ||
+ skb->prev == sk->sk_write_queue.next)
+ return false;
+
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
@@ -2081,6 +2123,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
return false;
}
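
A standalone model of the exemption introduced above: packets at the head of the write queue (first or second skb) are sent even past the small-queue limit, so a stalled TX-completion path cannot freeze the flow entirely. Numbers and names are illustrative:

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model of the limit check: throttle only when over the
	 * byte limit and not one of the first two queued packets.
	 */
	static bool small_queue_throttle(unsigned int wmem_bytes,
					 unsigned int limit_bytes,
					 unsigned int skb_index_in_queue)
	{
		if (wmem_bytes <= limit_bytes)
			return false;       /* under the limit: send */
		if (skb_index_in_queue < 2)
			return false;       /* head-of-queue exemption */
		return true;                /* throttle, wait for TX done */
	}

	int main(void)
	{
		printf("%d\n", small_queue_throttle(300000, 128000, 0)); /* 0 */
		printf("%d\n", small_queue_throttle(300000, 128000, 5)); /* 1 */
		return 0;
	}
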
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+ const u32 now = tcp_time_stamp;
+
+ if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+ tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+ tp->chrono_start = now;
+ tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If there are multiple conditions worthy of tracking in a
+ * chronograph, then the highest-priority enum takes precedence
+ * over the other conditions. So if something "more interesting"
+ * starts happening, stop the previous chrono and start a new one.
+ */
+ if (type > tp->chrono_type)
+ tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* There are multiple conditions worthy of tracking in a
+ * chronograph, and the highest-priority enum takes
+ * precedence over the other conditions (see tcp_chrono_start).
+ * If a condition stops, we only stop chrono tracking if
+ * it's the "most interesting" or current chrono we are
+ * tracking, and we start the busy chrono if we have pending data.
+ */
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+ else if (type == tp->chrono_type)
+ tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
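
A standalone model of the chronograph semantics spelled out in the two comments above: the enum order is the priority order, start() only upgrades, and stop() falls back to BUSY while data remains queued. The enum names mirror the kernel's, but the code is a userspace sketch with a fake clock:

	#include <stdio.h>

	enum chrono { UNSPEC, BUSY, RWND_LIMITED, SNDBUF_LIMITED, NR };

	struct chrono_clock {
		unsigned long now;   /* fake clock, in "ticks" */
		unsigned long start; /* when the current state began */
		enum chrono type;
		unsigned long stat[NR - 1];
	};

	/* Charge elapsed time to the outgoing state, then switch. */
	static void chrono_set(struct chrono_clock *c, enum chrono new)
	{
		if (c->type > UNSPEC)
			c->stat[c->type - 1] += c->now - c->start;
		c->start = c->now;
		c->type = new;
	}

	static void chrono_start(struct chrono_clock *c, enum chrono t)
	{
		if (t > c->type) /* only "more interesting" states preempt */
			chrono_set(c, t);
	}

	static void chrono_stop(struct chrono_clock *c, enum chrono t,
				int queue_empty)
	{
		if (queue_empty)
			chrono_set(c, UNSPEC);
		else if (t == c->type)
			chrono_set(c, BUSY);
	}

	int main(void)
	{
		struct chrono_clock c = { 0 };

		chrono_start(&c, BUSY);
		c.now = 5;
		chrono_start(&c, RWND_LIMITED); /* preempts BUSY after 5 */
		c.now = 9;
		chrono_stop(&c, RWND_LIMITED, 0); /* data queued: back to BUSY */
		printf("busy=%lu rwnd=%lu\n", c.stat[BUSY - 1],
		       c.stat[RWND_LIMITED - 1]); /* busy=5 rwnd=4 */
		return 0;
	}
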
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2103,7 +2186,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
- bool is_cwnd_limited = false;
+ bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
sent_pkts = 0;
@@ -2140,8 +2223,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+ is_rwnd_limited = true;
break;
+ }
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -2167,6 +2252,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
@@ -2186,6 +2273,11 @@ repair:
break;
}
+ if (is_rwnd_limited)
+ tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+ else
+ tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -2207,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
u32 timeout, tlp_time_stamp, rto_time_stamp;
u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
- if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
- return false;
/* No consecutive loss probes. */
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
tcp_rearm_rto(sk);
@@ -2227,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
- !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+ !tp->packets_out || !tcp_is_sack(tp) ||
+ icsk->icsk_ca_state != TCP_CA_Open)
return false;
if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -2436,9 +2527,11 @@ u32 __tcp_select_window(struct sock *sk)
int full_space = min_t(int, tp->window_clamp, allowed_space);
int window;
- if (mss > full_space)
+ if (unlikely(mss > full_space)) {
mss = full_space;
-
+ if (mss <= 0)
+ return 0;
+ }
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
@@ -2514,7 +2607,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
}
/* Collapses two adjacent SKB's during retransmission. */
-static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -2525,13 +2618,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+ if (next_skb_size) {
+ if (next_skb_size <= skb_availroom(skb))
+ skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+ next_skb_size);
+ else if (!skb_shift(skb, next_skb, next_skb_size))
+ return false;
+ }
tcp_highest_sack_combine(sk, next_skb, skb);
tcp_unlink_write_queue(next_skb, sk);
- skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
- next_skb_size);
-
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -2560,6 +2657,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
tcp_skb_collapse_tstamp(skb, next_skb);
sk_wmem_free_skb(sk, next_skb);
+ return true;
}
/* Check if coalescing SKBs is legal. */
@@ -2567,14 +2665,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
return false;
- /* TODO: SACK collapsing could be used to remove this condition */
- if (skb_shinfo(skb)->nr_frags != 0)
- return false;
if (skb_cloned(skb))
return false;
if (skb == tcp_send_head(sk))
return false;
- /* Some heurestics for collapsing over SACK'd could be invented */
+ /* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
@@ -2612,16 +2707,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (space < 0)
break;
- /* Punt if not enough space exists in the first SKB for
- * the data in the second
- */
- if (skb->len > skb_availroom(to))
- break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
break;
- tcp_collapse_retrans(sk, to);
+ if (!tcp_collapse_retrans(sk, to))
+ break;
}
}
@@ -2694,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
+ /* Update global and local TCP statistics. */
+ segs = tcp_skb_pcount(skb);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans += segs;
+
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
@@ -2711,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
}
if (likely(!err)) {
- segs = tcp_skb_pcount(skb);
-
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
- /* Update global TCP statistics. */
- TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans += segs;
+ } else if (err != -EBUSY) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
return err;
}
@@ -2741,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- } else if (err != -EBUSY) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
@@ -2751,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
return err;
}
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_sock *tp = tcp_sk(sk);
-
- /* Forward retransmissions are possible only during Recovery. */
- if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return false;
-
- /* No forward retransmissions in Reno are possible. */
- if (tcp_is_reno(tp))
- return false;
-
- /* Yeah, we have to make difficult choice between forward transmission
- * and retransmission... Both ways have their merits...
- *
- * For now we do not retransmit anything, while we have some new
- * segments to send. In the other cases, follow rule 3 for
- * NextSeg() specified in RFC3517.
- */
-
- if (tcp_may_send_now(sk))
- return false;
-
- return true;
-}
-
/* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
@@ -2795,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 max_segs, last_lost;
+ u32 max_segs;
int mib_idx;
- int fwd_rexmitting = 0;
if (!tp->packets_out)
return;
- if (!tp->lost_out)
- tp->retransmit_high = tp->snd_una;
-
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
- last_lost = TCP_SKB_CB(skb)->end_seq;
- if (after(last_lost, tp->retransmit_high))
- last_lost = tp->retransmit_high;
} else {
skb = tcp_write_queue_head(sk);
- last_lost = tp->snd_una;
}
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2835,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
*/
segs = min_t(int, segs, max_segs);
- if (fwd_rexmitting) {
-begin_fwd:
- if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
- break;
- mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
- } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
- tp->retransmit_high = last_lost;
- if (!tcp_can_forward_retransmit(sk))
- break;
- /* Backtrack if necessary to non-L'ed skb */
- if (hole) {
- skb = hole;
- hole = NULL;
- }
- fwd_rexmitting = 1;
- goto begin_fwd;
-
+ if (tp->retrans_out >= tp->lost_out) {
+ break;
} else if (!(sacked & TCPCB_LOST)) {
if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
} else {
- last_lost = TCP_SKB_CB(skb)->end_seq;
if (icsk->icsk_ca_state != TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPFASTRETRANS;
else
@@ -2880,7 +2916,8 @@ begin_fwd:
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
- if (skb == tcp_write_queue_head(sk))
+ if (skb == tcp_write_queue_head(sk) &&
+ icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
@@ -2962,6 +2999,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
struct sk_buff *skb;
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+
/* NOTE: No TCP options attached and we never retransmit this. */
skb = alloc_skb(MAX_TCP_HEADER, priority);
if (!skb) {
@@ -2977,8 +3016,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* Send it off. */
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
-
- TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
}
/* Send a crossed SYN-ACK during socket establishment.
@@ -3037,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct sk_buff *skb;
int tcp_header_size;
struct tcphdr *th;
- u16 user_mss;
int mss;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3067,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
}
skb_dst_set(skb, dst);
- mss = dst_metric_advmss(dst);
- user_mss = READ_ONCE(tp->rx_opt.user_mss);
- if (user_mss && user_mss < mss)
- mss = user_mss;
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
@@ -3123,7 +3156,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
#endif
/* Do not fool tcpdump (if any), clean our debris */
- skb->tstamp.tv64 = 0;
+ skb->tstamp = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
@@ -3176,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk)
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
- tp->advmss = dst_metric_advmss(dst);
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
- tp->advmss = tp->rx_opt.user_mss;
+ tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
@@ -3244,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0;
- unsigned long last_syn_loss = 0;
+ int space, err = 0;
struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
- tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
- &syn_loss, &last_syn_loss);
- /* Recurring FO SYN losses: revert to regular handshake temporarily */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- fo->cookie.len = -1;
- goto fallback;
- }
-
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
- fo->cookie.len = -1;
- else if (fo->cookie.len <= 0)
+ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
* user-MSS. Reserve maximum option space for middleboxes that add
* private TCP options. The cost is reduced data space in SYN :(
*/
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
@@ -3300,6 +3319,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
fo->copied = space;
tcp_connect_queue_skb(sk, syn_data);
+ if (syn_data->len)
+ tcp_chrono_start(sk, TCP_CHRONO_BUSY);
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
@@ -3464,8 +3485,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
- * We also avoid tcp_wfree() overhead (cache line miss accessing
- * tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);
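
Three call sites above (tcp_make_synack(), tcp_connect_init() and tcp_send_syn_data()) replace the open-coded user_mss comparison with tcp_mss_clamp(). Reconstructed from the removed lines, the helper plausibly amounts to the following (names and types simplified, not the kernel's exact definition):

#include <stdint.h>

static inline uint16_t mss_clamp(uint16_t user_mss, uint16_t mss)
{
	/* honour a user-set TCP_MAXSEG only when it tightens the value */
	return (user_mss && user_mss < mss) ? user_mss : mss;
}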
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..d8acbd9f477a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,9 +1,33 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
-/* Marks a packet lost, if some packet sent later has been (s)acked.
+static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ /* Account for retransmits that are lost again */
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+ tcp_skb_pcount(skb));
+ }
+}
+
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+ const struct skb_mstamp *t2,
+ u32 seq1, u32 seq2)
+{
+ return skb_mstamp_after(t1, t2) ||
+ (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics:
*
@@ -16,31 +40,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
* is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh.
*
- * The current version is only used after recovery starts but can be
- * easily extended to detect the first loss.
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
*/
-int tcp_rack_mark_lost(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
+ u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- u32 reo_wnd, prior_retrans = tp->retrans_out;
-
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
- return 0;
-
- /* Reset the advanced flag to avoid unnecessary queue scanning */
- tp->rack.advanced = 0;
+ u32 reo_wnd;
+ *reo_timeout = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
- *
- * TODO: measure and adapt to the observed reordering delay, and
- * use a timer to retransmit like the delayed early retransmit.
*/
reo_wnd = 1000;
- if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) {
@@ -54,20 +73,29 @@ int tcp_rack_mark_lost(struct sock *sk)
scb->sacked & TCPCB_SACKED_ACKED)
continue;
- if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+ if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
+ tp->rack.end_seq, scb->end_seq)) {
+ /* Step 3 in draft-cheng-tcpm-rack-00.txt:
+ * A packet is lost if its elapsed time is beyond
+ * the recent RTT plus the reordering window.
+ */
+ u32 elapsed = skb_mstamp_us_delta(now,
+ &skb->skb_mstamp);
+ s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
- if (skb_mstamp_us_delta(&tp->rack.mstamp,
- &skb->skb_mstamp) <= reo_wnd)
+ if (remaining < 0) {
+ tcp_rack_mark_skb_lost(sk, skb);
continue;
-
- /* skb is lost if packet sent later is sacked */
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- if (scb->sacked & TCPCB_SACKED_RETRANS) {
- scb->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
}
+
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ /* Record maximum wait time (+1 to avoid 0) */
+ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
@@ -75,20 +103,43 @@ int tcp_rack_mark_lost(struct sock *sk)
break;
}
}
- return prior_retrans - tp->retrans_out;
}
-/* Record the most recently (re)sent time among the (s)acked packets */
-void tcp_rack_advance(struct tcp_sock *tp,
- const struct skb_mstamp *xmit_time, u8 sacked)
+void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout;
+
+ if (!tp->rack.advanced)
+ return;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+ tcp_rack_detect_loss(sk, now, &timeout);
+ if (timeout) {
+ timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+ timeout, inet_csk(sk)->icsk_rto);
+ }
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
+ const struct skb_mstamp *xmit_time,
+ const struct skb_mstamp *ack_time)
{
+ u32 rtt_us;
+
if (tp->rack.mstamp.v64 &&
- !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
+ end_seq, tp->rack.end_seq))
return;
+ rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
if (sacked & TCPCB_RETRANS) {
- struct skb_mstamp now;
-
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
@@ -99,11 +150,35 @@ void tcp_rack_advance(struct tcp_sock *tp,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
- skb_mstamp_get(&now);
- if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ if (rtt_us < tcp_min_rtt(tp))
return;
}
-
+ tp->rack.rtt_us = rtt_us;
tp->rack.mstamp = *xmit_time;
+ tp->rack.end_seq = end_seq;
tp->rack.advanced = 1;
}
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ u32 timeout, prior_inflight;
+
+ skb_mstamp_get(&now);
+ prior_inflight = tcp_packets_in_flight(tp);
+ tcp_rack_detect_loss(sk, &now, &timeout);
+ if (prior_inflight != tcp_packets_in_flight(tp)) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+ tcp_enter_recovery(sk, false);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_cwnd_reduction(sk, 1, 0);
+ }
+ tcp_xmit_retransmit_queue(sk);
+ }
+ if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+ tcp_rearm_rto(sk);
+}
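
The core of tcp_rack_detect_loss() above is a per-packet deadline test: a packet sent before the most recently (s)acked one is declared lost once rack.rtt_us plus the reordering window has elapsed, otherwise the remaining wait feeds the REO timeout. Isolated into a stand-alone function (a sketch with simplified arguments, not the kernel signature):

#include <stdint.h>

/* Returns 1 if the packet should be marked lost now; otherwise extends
 * *timeout_us by the remaining wait (+1 so an armed timeout is never 0).
 */
static int rack_skb_expired(uint32_t rack_rtt_us, uint32_t reo_wnd_us,
			    uint32_t elapsed_us, uint32_t *timeout_us)
{
	int32_t remaining = (int32_t)(rack_rtt_us + reo_wnd_us - elapsed_us);

	if (remaining < 0)
		return 1;
	if ((uint32_t)remaining + 1 > *timeout_us)
		*timeout_us = (uint32_t)remaining + 1;
	return 0;
}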
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index bf5ea9e9bbc1..f2123075ce6e 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,6 +15,10 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
+struct scalable {
+ u32 loss_cwnd;
+};
+
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ struct scalable *ca = inet_csk_ca(sk);
+
+ ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
+static u32 tcp_scalable_cwnd_undo(struct sock *sk)
+{
+ const struct scalable *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
+ .undo_cwnd = tcp_scalable_cwnd_undo,
.cong_avoid = tcp_scalable_cong_avoid,
.owner = THIS_MODULE,
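
tcp_scalable gains the same undo_cwnd pair that the veno and yeah diffs further down add: ssthresh snapshots the congestion window into loss_cwnd, and undo_cwnd restores whichever of the current and pre-loss windows is larger. The shared pattern, modelled with plain integers as a sketch:

#include <stdint.h>

struct ca_priv {
	uint32_t loss_cwnd;	/* cwnd captured when ssthresh fired */
};

static uint32_t ca_ssthresh(struct ca_priv *ca, uint32_t cwnd)
{
	uint32_t target = cwnd - (cwnd >> 3);	/* scalable: cut by 1/8 */

	ca->loss_cwnd = cwnd;			/* remember pre-loss cwnd */
	return target > 2 ? target : 2;
}

static uint32_t ca_undo_cwnd(const struct ca_priv *ca, uint32_t cwnd)
{
	/* the loss turned out spurious: take the larger window back */
	return cwnd > ca->loss_cwnd ? cwnd : ca->loss_cwnd;
}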
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3ea1cf804748..b2ab411c6d37 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -249,7 +249,8 @@ void tcp_delack_timer_handler(struct sock *sk)
sk_mem_reclaim_partial(sk);
- if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
@@ -310,7 +311,7 @@ static void tcp_delack_timer(unsigned long data)
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
		/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -552,7 +553,8 @@ void tcp_write_timer_handler(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
- if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
+ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ !icsk->icsk_pending)
goto out;
if (time_after(icsk->icsk_timeout, jiffies)) {
@@ -563,8 +565,8 @@ void tcp_write_timer_handler(struct sock *sk)
event = icsk->icsk_pending;
switch (event) {
- case ICSK_TIME_EARLY_RETRANS:
- tcp_resume_early_retransmit(sk);
+ case ICSK_TIME_REO_TIMEOUT:
+ tcp_rack_reo_timeout(sk);
break;
case ICSK_TIME_LOSS_PROBE:
tcp_send_loss_probe(sk);
@@ -592,7 +594,7 @@ static void tcp_write_timer(unsigned long data)
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -617,6 +619,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
else if (!val)
inet_csk_delete_keepalive_timer(sk);
}
+EXPORT_SYMBOL_GPL(tcp_set_keepalive);
static void tcp_keepalive_timer (unsigned long data)
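
Both timer handlers above widen the bail-out test from sk_state == TCP_CLOSE to a bitmask that also covers listeners; since TCPF_FOO is defined as 1 << TCP_FOO, any set of states can be tested with one shift and one AND. A compact illustration using the kernel's actual state values:

#include <stdio.h>

enum { TCP_ESTABLISHED = 1, TCP_CLOSE = 7, TCP_LISTEN = 10 };
#define TCPF_CLOSE	(1 << TCP_CLOSE)
#define TCPF_LISTEN	(1 << TCP_LISTEN)

static int timer_should_bail(int state)
{
	return !!((1 << state) & (TCPF_CLOSE | TCPF_LISTEN));
}

int main(void)
{
	printf("%d %d %d\n",
	       timer_should_bail(TCP_CLOSE),	    /* 1 */
	       timer_should_bail(TCP_LISTEN),	    /* 1 */
	       timer_should_bail(TCP_ESTABLISHED)); /* 0 */
	return 0;
}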
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 4c4bac1b5eab..218cfcc77650 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_vegas_cong_avoid,
.pkts_acked = tcp_vegas_pkts_acked,
.set_state = tcp_vegas_state,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 40171e163cff..76005d4b8dfc 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -30,6 +30,7 @@ struct veno {
u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
u32 inc; /* decide whether to increase cwnd */
u32 diff; /* calculate the diff rate */
+	u32 loss_cwnd;	/* cwnd when loss occurred */
};
/* There are several situations when we must "re-start" Veno:
@@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
+ veno->loss_cwnd = tp->snd_cwnd;
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
return max(tp->snd_cwnd * 4 / 5, 2U);
@@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
+static u32 tcp_veno_cwnd_undo(struct sock *sk)
+{
+ const struct veno *veno = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
+ .undo_cwnd = tcp_veno_cwnd_undo,
.cong_avoid = tcp_veno_cong_avoid,
.pkts_acked = tcp_veno_pkts_acked,
.set_state = tcp_veno_state,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 4b03a2e2a050..fed66dc0e0f5 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = tcp_westwood_event,
.in_ack_event = tcp_westwood_ack,
.get_info = tcp_westwood_info,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9c5fc973267f..e6ff99c4bd3b 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -37,6 +37,7 @@ struct yeah {
u32 fast_count;
u32 pkts_acked;
+ u32 loss_cwnd;
};
static void tcp_yeah_init(struct sock *sk)
@@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
+ yeah->loss_cwnd = tp->snd_cwnd;
return max_t(int, tp->snd_cwnd - reduction, 2);
}
+static u32 tcp_yeah_cwnd_undo(struct sock *sk)
+{
+ const struct yeah *yeah = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
+ .undo_cwnd = tcp_yeah_cwnd_undo,
.cong_avoid = tcp_yeah_cong_avoid,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5bab6c3f7a2f..ea6e4cff9faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -79,7 +79,7 @@
#define pr_fmt(fmt) "UDP: " fmt
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
@@ -134,14 +134,21 @@ EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+/* The IPCB reference means this cannot be used from early demux */
+static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+ skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
+ return true;
+#endif
+ return false;
+}
+
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
unsigned long *bitmap,
- struct sock *sk,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard),
- unsigned int log)
+ struct sock *sk, unsigned int log)
{
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
@@ -153,13 +160,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- (!sk2->sk_reuseport || !sk->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2, true)) {
- if (!bitmap)
- return 1;
- __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sock_i_uid(sk2))) {
+ if (!bitmap)
+ return 0;
+ } else {
+ if (!bitmap)
+ return 1;
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log,
+ bitmap);
+ }
}
}
return 0;
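
The restructured loop above stops counting a compatible SO_REUSEPORT peer as a bind conflict: when both sockets enable reuseport, belong to the same uid, and the probing socket has not joined a reuseport group yet, the slot is treated as free. The predicate, pulled out as a sketch with simplified types (field names here are illustrative, not the kernel's):

#include <stdbool.h>
#include <sys/types.h>

struct bind_ctx {
	bool reuseport;		/* SO_REUSEPORT set */
	bool in_group;		/* sk_reuseport_cb already attached */
	uid_t uid;		/* owning user */
};

/* true when @sk may share the port with the already-bound @sk2 */
static bool reuseport_compatible(const struct bind_ctx *sk,
				 const struct bind_ctx *sk2)
{
	return sk->reuseport && sk2->reuseport &&
	       !sk->in_group && sk->uid == sk2->uid;
}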
@@ -171,10 +183,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
*/
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
struct udp_hslot *hslot2,
- struct sock *sk,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+ struct sock *sk)
{
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
@@ -188,11 +197,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- (!sk2->sk_reuseport || !sk->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2, true)) {
- res = 1;
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sock_i_uid(sk2))) {
+ res = 0;
+ } else {
+ res = 1;
+ }
break;
}
}
@@ -200,10 +212,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
return res;
}
-static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
struct net *net = sock_net(sk);
kuid_t uid = sock_i_uid(sk);
@@ -217,7 +226,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
(udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
- (*saddr_same)(sk, sk2, false)) {
+ inet_rcv_saddr_equal(sk, sk2, false)) {
return reuseport_add_sock(sk, sk2);
}
}
@@ -233,14 +242,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
*
* @sk: socket struct in question
* @snum: port number to look up
- * @saddr_comp: AF-dependent comparison of bound local IP addresses
* @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
* with NULL address
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard),
unsigned int hash2_nulladdr)
{
struct udp_hslot *hslot, *hslot2;
@@ -269,7 +274,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
bitmap_zero(bitmap, PORTS_PER_CHAIN);
spin_lock_bh(&hslot->lock);
udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
- saddr_comp, udptable->log);
+ udptable->log);
snum = first;
/*
@@ -285,6 +290,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
snum += rand;
} while (snum != first);
spin_unlock_bh(&hslot->lock);
+ cond_resched();
} while (++first != last);
goto fail;
} else {
@@ -301,12 +307,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
if (hslot->count < hslot2->count)
goto scan_primary_hash;
- exist = udp_lib_lport_inuse2(net, snum, hslot2,
- sk, saddr_comp);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
if (!exist && (hash2_nulladdr != slot2)) {
hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
exist = udp_lib_lport_inuse2(net, snum, hslot2,
- sk, saddr_comp);
+ sk);
}
if (exist)
goto fail_unlock;
@@ -314,8 +319,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
goto found;
}
scan_primary_hash:
- if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
- saddr_comp, 0))
+ if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
goto fail_unlock;
}
found:
@@ -324,7 +328,7 @@ found:
udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
if (sk->sk_reuseport &&
- udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+ udp_reuseport_add_sock(sk, hslot)) {
inet_sk(sk)->inet_num = 0;
udp_sk(sk)->udp_port_hash = 0;
udp_sk(sk)->udp_portaddr_hash ^= snum;
@@ -356,24 +360,6 @@ fail:
}
EXPORT_SYMBOL(udp_lib_get_port);
-/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- * 0.0.0.0 only equals to 0.0.0.0
- */
-int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
- bool match_wildcard)
-{
- struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
-
- if (!ipv6_only_sock(sk2)) {
- if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
- return 1;
- if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
- return match_wildcard;
- }
- return 0;
-}
-
static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
unsigned int port)
{
@@ -389,12 +375,13 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
- return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+ return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum, int dif)
+ __be32 daddr, unsigned short hnum, int dif,
+ bool exact_dif)
{
int score;
struct inet_sock *inet;
@@ -425,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if || exact_dif) {
if (sk->sk_bound_dev_if != dif)
return -1;
score += 4;
@@ -450,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif,
+ __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
struct udp_hslot *hslot2,
struct sk_buff *skb)
{
@@ -462,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif);
+ daddr, hnum, dif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -497,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ bool exact_dif = udp_lib_exact_dif_match(net, skb);
int score, badness, matches = 0, reuseport = 0;
u32 hash = 0;
@@ -509,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, skb);
+ exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
@@ -524,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, skb);
+ exact_dif, hslot2, skb);
}
return result;
}
@@ -533,7 +521,7 @@ begin:
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif);
+ daddr, hnum, dif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -580,7 +568,8 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
+ IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
@@ -1019,7 +1008,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
- faddr, saddr, dport, inet->inet_sport);
+ faddr, saddr, dport, inet->inet_sport,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -1111,7 +1101,8 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -1172,6 +1163,181 @@ out:
return ret;
}
+/* fully reclaim rmem/fwd memory allocated for skb */
+static void udp_rmem_release(struct sock *sk, int size, int partial)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int amt;
+
+ if (likely(partial)) {
+ up->forward_deficit += size;
+ size = up->forward_deficit;
+ if (size < (sk->sk_rcvbuf >> 2) &&
+ !skb_queue_empty(&sk->sk_receive_queue))
+ return;
+ } else {
+ size += up->forward_deficit;
+ }
+ up->forward_deficit = 0;
+
+ sk->sk_forward_alloc += size;
+ amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
+ sk->sk_forward_alloc -= amt;
+
+ if (amt)
+ __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+
+ atomic_sub(size, &sk->sk_rmem_alloc);
+}
+
+/* Note: called with sk_receive_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
+{
+ udp_rmem_release(sk, skb->dev_scratch, 1);
+}
+EXPORT_SYMBOL(udp_skb_destructor);
+
+/* Idea of busylocks is to let producers grab an extra spinlock
+ * to relieve pressure on the receive_queue spinlock shared by consumer.
+ * Under flood, this means that only one producer can be in line
+ * trying to acquire the receive_queue spinlock.
+ * These busylocks can be allocated in a per-cpu manner, instead of a
+ * per-socket one (that would consume a cache line per socket).
+ */
+static int udp_busylocks_log __read_mostly;
+static spinlock_t *udp_busylocks __read_mostly;
+
+static spinlock_t *busylock_acquire(void *ptr)
+{
+ spinlock_t *busy;
+
+ busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
+ spin_lock(busy);
+ return busy;
+}
+
+static void busylock_release(spinlock_t *busy)
+{
+ if (busy)
+ spin_unlock(busy);
+}
+
+int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+ int rmem, delta, amt, err = -ENOMEM;
+ spinlock_t *busy = NULL;
+ int size;
+
+ /* try to avoid the costly atomic add/sub pair when the receive
+ * queue is full; always allow at least a packet
+ */
+ rmem = atomic_read(&sk->sk_rmem_alloc);
+ if (rmem > sk->sk_rcvbuf)
+ goto drop;
+
+	/* Under memory pressure, it is helpful to hand udp_recvmsg()
+	 * linear skbs:
+ * - Reduce memory overhead and thus increase receive queue capacity
+ * - Less cache line misses at copyout() time
+ * - Less work at consume_skb() (less alien page frag freeing)
+ */
+ if (rmem > (sk->sk_rcvbuf >> 1)) {
+ skb_condense(skb);
+
+ busy = busylock_acquire(sk);
+ }
+ size = skb->truesize;
+ /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
+ * in udp_skb_destructor()
+ */
+ skb->dev_scratch = size;
+
+ /* we drop only if the receive buf is full and the receive
+ * queue contains some other skb
+ */
+ rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
+ if (rmem > (size + sk->sk_rcvbuf))
+ goto uncharge_drop;
+
+ spin_lock(&list->lock);
+ if (size >= sk->sk_forward_alloc) {
+ amt = sk_mem_pages(size);
+ delta = amt << SK_MEM_QUANTUM_SHIFT;
+ if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
+ err = -ENOBUFS;
+ spin_unlock(&list->lock);
+ goto uncharge_drop;
+ }
+
+ sk->sk_forward_alloc += delta;
+ }
+
+ sk->sk_forward_alloc -= size;
+
+ /* no need to setup a destructor, we will explicitly release the
+ * forward allocated memory on dequeue
+ */
+ sock_skb_set_dropcount(sk, skb);
+
+ __skb_queue_tail(list, skb);
+ spin_unlock(&list->lock);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+
+ busylock_release(busy);
+ return 0;
+
+uncharge_drop:
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+
+drop:
+ atomic_inc(&sk->sk_drops);
+ busylock_release(busy);
+ return err;
+}
+EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
+
+void udp_destruct_sock(struct sock *sk)
+{
+ /* reclaim completely the forward allocated memory */
+ unsigned int total = 0;
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ total += skb->truesize;
+ kfree_skb(skb);
+ }
+ udp_rmem_release(sk, total, 0);
+
+ inet_sock_destruct(sk);
+}
+EXPORT_SYMBOL_GPL(udp_destruct_sock);
+
+int udp_init_sock(struct sock *sk)
+{
+ sk->sk_destruct = udp_destruct_sock;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_init_sock);
+
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+{
+ if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
+ bool slow = lock_sock_fast(sk);
+
+ sk_peek_offset_bwd(sk, len);
+ unlock_sock_fast(sk, slow);
+ }
+ consume_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_consume_udp);
+
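
udp_rmem_release() above batches memory returns: partial releases accumulate in forward_deficit until they reach a quarter of sk_rcvbuf (or the queue drains), and only whole SK_MEM_QUANTUM multiples are handed back to the global protocol accounting. A user-space model of that batching, assuming a 4 KB quantum:

#include <stdbool.h>

#define QUANTUM 4096	/* stand-in for SK_MEM_QUANTUM */

struct rmem_acct {
	int rcvbuf;		/* receive buffer limit */
	int rmem_alloc;		/* bytes charged to the receive queue */
	int forward_alloc;	/* bytes pre-charged for future packets */
	int deficit;		/* partial releases batched so far */
	int reclaimed;		/* quanta returned to global accounting */
};

static void rmem_release(struct rmem_acct *m, int size, int partial,
			 bool queue_empty)
{
	int amt;

	if (partial) {
		m->deficit += size;
		/* flush only once the batch is worthwhile or queue drained */
		if (m->deficit < (m->rcvbuf >> 2) && !queue_empty)
			return;
		size = m->deficit;
	} else {
		size += m->deficit;
	}
	m->deficit = 0;

	m->forward_alloc += size;
	/* return whole quanta; keep the remainder cached on the socket */
	amt = (m->forward_alloc - partial) & ~(QUANTUM - 1);
	m->forward_alloc -= amt;
	m->reclaimed += amt / QUANTUM;
	m->rmem_alloc -= size;
}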
/**
* first_packet_length - return length of first packet in receive queue
* @sk: socket
@@ -1181,12 +1347,11 @@ out:
*/
static int first_packet_length(struct sock *sk)
{
- struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+ struct sk_buff_head *rcvq = &sk->sk_receive_queue;
struct sk_buff *skb;
+ int total = 0;
int res;
- __skb_queue_head_init(&list_kill);
-
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
@@ -1196,18 +1361,13 @@ static int first_packet_length(struct sock *sk)
IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
- __skb_queue_tail(&list_kill, skb);
+ total += skb->truesize;
+ kfree_skb(skb);
}
res = skb ? skb->len : -1;
+ if (total)
+ udp_rmem_release(sk, total, 1);
spin_unlock_bh(&rcvq->lock);
-
- if (!skb_queue_empty(&list_kill)) {
- bool slow = lock_sock_fast(sk);
-
- __skb_queue_purge(&list_kill);
- sk_mem_reclaim_partial(sk);
- unlock_sock_fast(sk, slow);
- }
return res;
}
@@ -1256,15 +1416,13 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
- bool slow;
if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len);
try_again:
peeking = off = sk_peek_offset(sk, flags);
- skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
@@ -1281,7 +1439,8 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
@@ -1297,13 +1456,12 @@ try_again:
}
if (unlikely(err)) {
- trace_kfree_skb(skb, udp_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
- skb_free_datagram_locked(sk, skb);
+ kfree_skb(skb);
return err;
}
@@ -1322,22 +1480,21 @@ try_again:
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off);
+ ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
err = copied;
if (flags & MSG_TRUNC)
err = ulen;
- __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
+ skb_consume_udp(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
- slow = lock_sock_fast(sk);
- if (!skb_kill_datagram(sk, skb, flags)) {
+ if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
- unlock_sock_fast(sk, slow);
+ kfree_skb(skb);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -1463,9 +1620,11 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
sk_incoming_cpu_update(sk);
+ } else {
+ sk_mark_napi_id_once(sk, skb);
}
- rc = __sock_queue_rcv_skb(sk, skb);
+ rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1480,7 +1639,6 @@ int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
return 0;
-
}
static struct static_key udp_encap_needed __read_mostly;
@@ -1502,7 +1660,6 @@ EXPORT_SYMBOL(udp_encap_enable);
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
- int rc;
int is_udplite = IS_UDPLITE(sk);
/*
@@ -1589,25 +1746,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
udp_csum_pull_header(skb);
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
- goto drop;
- }
-
- rc = 0;
ipv4_pktinfo_prepare(sk, skb);
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk))
- rc = __udp_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
- bh_unlock_sock(sk);
- goto drop;
- }
- bh_unlock_sock(sk);
-
- return rc;
+ return __udp_queue_rcv_skb(sk, skb);
csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -2217,13 +2358,13 @@ struct proto udp_prot = {
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
+ .init = udp_init_sock,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
- .backlog_rcv = __udp_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
@@ -2512,6 +2653,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
void __init udp_init(void)
{
unsigned long limit;
+ unsigned int i;
udp_table_init(&udp_table, "UDP");
limit = nr_free_buffer_pages() / 8;
@@ -2522,4 +2664,13 @@ void __init udp_init(void)
sysctl_udp_rmem_min = SK_MEM_QUANTUM;
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+
+ /* 16 spinlocks per cpu */
+ udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
+ udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
+ GFP_KERNEL);
+ if (!udp_busylocks)
+ panic("UDP: failed to alloc udp_busylocks\n");
+ for (i = 0; i < (1U << udp_busylocks_log); i++)
+ spin_lock_init(udp_busylocks + i);
}
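
The busylock scheme added above takes the pressure of a packet flood off the receive-queue spinlock: producers first serialize on one of 16 * nr_cpu_ids hashed spinlocks, so at most one of them queues up on the per-socket lock at a time. A pthread-based sketch of the same idea (the multiplicative pointer hash stands in for the kernel's hash_ptr() and is an assumption, not its implementation):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

static unsigned int busylocks_log;
static pthread_spinlock_t *busylocks;

/* multiplicative pointer hash standing in for the kernel's hash_ptr() */
static unsigned int hash_ptr(const void *p, unsigned int bits)
{
	uint64_t h = (uint64_t)(uintptr_t)p * 0x9E3779B97F4A7C15ull;

	return (unsigned int)(h >> (64 - bits));
}

static int busylocks_init(unsigned int ncpus)
{
	unsigned int i, n;

	busylocks_log = 4;			/* 16 locks per cpu */
	while ((1u << busylocks_log) < (ncpus << 4))
		busylocks_log++;
	n = 1u << busylocks_log;
	busylocks = malloc(n * sizeof(*busylocks));
	if (!busylocks)
		return -1;
	for (i = 0; i < n; i++)
		pthread_spin_init(&busylocks[i], PTHREAD_PROCESS_PRIVATE);
	return 0;
}

static pthread_spinlock_t *busylock_acquire(void *sk)
{
	pthread_spinlock_t *busy = &busylocks[hash_ptr(sk, busylocks_log)];

	pthread_spin_lock(busy);	/* one producer per socket in line */
	return busy;
}

static void busylock_release(pthread_spinlock_t *busy)
{
	if (busy)
		pthread_spin_unlock(busy);
}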
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ff450c2aad9b..59f10fe9782e 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -50,10 +50,11 @@ struct proto udplite_prot = {
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
- .backlog_rcv = __udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 62e1e72db461..1fc684111ce6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -40,6 +40,7 @@ drop:
int xfrm4_transport_finish(struct sk_buff *skb, int async)
{
+ struct xfrm_offload *xo = xfrm_offload(skb);
struct iphdr *iph = ip_hdr(skb);
iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
@@ -53,6 +54,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
+ if (xo && (xo->flags & XFRM_GRO)) {
+ skb_mac_header_rebuild(skb);
+ return 0;
+ }
+
NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
xfrm4_rcv_encap_finish);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7d75ea..4acc0508c5eb 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -43,6 +43,7 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ihl = skb->data - skb_transport_header(skb);
+ struct xfrm_offload *xo = xfrm_offload(skb);
if (skb->transport_header != skb->network_header) {
memmove(skb_transport_header(skb),
@@ -50,7 +51,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
skb->network_header = skb->transport_header;
}
ip_hdr(skb)->tot_len = htons(skb->len + ihl);
- skb_reset_transport_header(skb);
+ if (!xo || !(xo->flags & XFRM_GRO))
+ skb_reset_transport_header(skb);
return 0;
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6a7ff6957535..71b4ecc195c7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
#include <net/ip.h>
#include <net/l3mdev.h>
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
int tos, int oif,
const xfrm_address_t *saddr,
@@ -219,7 +217,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
- xfrm4_policy_afinfo.garbage_collect(net);
+ xfrm_garbage_collect_deferred(net);
return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
@@ -271,8 +269,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
.gc_thresh = INT_MAX,
};
-static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
- .family = AF_INET,
+static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
@@ -376,7 +373,7 @@ static struct pernet_operations __net_initdata xfrm4_net_ops = {
static void __init xfrm4_policy_init(void)
{
- xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
}
void __init xfrm4_init(void)
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index dccefa9d84cf..8dd0e6ab8606 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -188,9 +188,8 @@ static const struct net_protocol ipcomp4_protocol = {
.netns_ok = 1,
};
-static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
.family = AF_INET,
- .owner = THIS_MODULE,
.callback = xfrm4_rcv_cb,
};
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 542074c00c78..d6660a8c0ea5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -90,11 +90,3 @@ void __init xfrm4_state_init(void)
{
xfrm_state_register_afinfo(&xfrm4_state_afinfo);
}
-
-#if 0
-void __exit xfrm4_state_fini(void)
-{
- xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
-}
-#endif /* 0 */
-