path: root/net/ipv4
author     Linus Torvalds <torvalds@linux-foundation.org>  2020-04-01 03:29:33 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-04-01 03:29:33 +0300
commit     29d9f30d4ce6c7a38745a54a8cddface10013490 (patch)
tree       85649ba6a7b39203584d8db9365e03f64e62c136 /net/ipv4
parent     56a451b780676bc1cdac011735fe2869fa2e9abf (diff)
parent     7f80ccfe996871ca69648efee74a60ae7ad0dcd9 (diff)
download   linux-29d9f30d4ce6c7a38745a54a8cddface10013490.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
 "Highlights:

  1) Fix the iwlwifi regression, from Johannes Berg.
  2) Support BSS coloring and 802.11 encapsulation offloading in hardware, from John Crispin.
  3) Fix some potential Spectre issues in qtnfmac, from Sergey Matyukevich.
  4) Add TTL decrement action to openvswitch, from Matteo Croce.
  5) Allow parallelization through flow_action setup by not taking the RTNL mutex, from Vlad Buslov.
  6) A lot of zero-length array to flexible-array conversions, from Gustavo A. R. Silva.
  7) Align XDP statistics names across several drivers for consistency, from Lorenzo Bianconi.
  8) Add various pieces of infrastructure for offloading conntrack, and make use of it in mlx5 driver, from Paul Blakey.
  9) Allow using listening sockets in BPF sockmap, from Jakub Sitnicki.
 10) Lots of parallelization improvements during configuration changes in mlxsw driver, from Ido Schimmel.
 11) Add support to devlink for generic packet traps, which report packets dropped during ACL processing. And use them in mlxsw driver. From Jiri Pirko.
 12) Support bcmgenet on ACPI, from Jeremy Linton.
 13) Make BPF compatible with RT, from Thomas Gleixner, Alexei Starovoitov, and yours truly.
 14) Support XDP meta-data in virtio_net, from Yuya Kusakabe.
 15) Fix sysfs permissions when network devices change namespaces, from Christian Brauner.
 16) Add a flags element to ethtool_ops so that drivers can more simply indicate which coalescing parameters they actually support, and therefore the generic layer can validate the user's ethtool request. Use this in all drivers, from Jakub Kicinski.
 17) Offload FIFO qdisc in mlxsw, from Petr Machata.
 18) Support UDP sockets in sockmap, from Lorenz Bauer.
 19) Fix stretch ACK bugs in several TCP congestion control modules, from Pengcheng Yang.
 20) Support virtual functions in octeontx2 driver, from Tomasz Duszynski.
 21) Add region operations for devlink and use it in ice driver to dump NVM contents, from Jacob Keller.
 22) Add support for hw offload of MACSEC, from Antoine Tenart.
 23) Add support for BPF programs that can be attached to LSM hooks, from KP Singh.
 24) Support for multiple paths, path managers, and counters in MPTCP. From Peter Krystad, Paolo Abeni, Florian Westphal, Davide Caratti, and others.
 25) More progress on adding the netlink interface to ethtool, from Michal Kubecek"

* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2121 commits)
  net: ipv6: rpl_iptunnel: Fix potential memory leak in rpl_do_srh_inline
  cxgb4/chcr: nic-tls stats in ethtool
  net: dsa: fix oops while probing Marvell DSA switches
  net/bpfilter: remove superfluous testing message
  net: macb: Fix handling of fixed-link node
  net: dsa: ksz: Select KSZ protocol tag
  netdevsim: dev: Fix memory leak in nsim_dev_take_snapshot_write
  net: stmmac: add EHL 2.5Gbps PCI info and PCI ID
  net: stmmac: add EHL PSE0 & PSE1 1Gbps PCI info and PCI ID
  net: stmmac: create dwmac-intel.c to contain all Intel platform
  net: dsa: bcm_sf2: Support specifying VLAN tag egress rule
  net: dsa: bcm_sf2: Add support for matching VLAN TCI
  net: dsa: bcm_sf2: Move writing of CFP_DATA(5) into slicing functions
  net: dsa: bcm_sf2: Check earlier for FLOW_EXT and FLOW_MAC_EXT
  net: dsa: bcm_sf2: Disable learning for ASP port
  net: dsa: b53: Deny enslaving port 7 for 7278 into a bridge
  net: dsa: b53: Prevent tagged VLAN on port 7 for 7278
  net: dsa: b53: Restore VLAN entries upon (re)configuration
  net: dsa: bcm_sf2: Fix overflow checks
  hv_netvsc: Remove unnecessary round_up for recv_completion_cnt
  ...
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Makefile                     1
-rw-r--r--  net/ipv4/af_inet.c                    8
-rw-r--r--  net/ipv4/ah4.c                        2
-rw-r--r--  net/ipv4/arp.c                        2
-rw-r--r--  net/ipv4/bpf_tcp_ca.c                33
-rw-r--r--  net/ipv4/devinet.c                    6
-rw-r--r--  net/ipv4/esp4.c                      16
-rw-r--r--  net/ipv4/esp4_offload.c              32
-rw-r--r--  net/ipv4/fib_lookup.h                 2
-rw-r--r--  net/ipv4/fib_semantics.c             26
-rw-r--r--  net/ipv4/fib_trie.c                  10
-rw-r--r--  net/ipv4/icmp.c                       2
-rw-r--r--  net/ipv4/igmp.c                       2
-rw-r--r--  net/ipv4/inet_connection_sock.c      36
-rw-r--r--  net/ipv4/inet_diag.c                307
-rw-r--r--  net/ipv4/ip_input.c                   3
-rw-r--r--  net/ipv4/ip_output.c                  4
-rw-r--r--  net/ipv4/ip_tunnel.c                  6
-rw-r--r--  net/ipv4/ip_tunnel_core.c             4
-rw-r--r--  net/ipv4/ipmr.c                       2
-rw-r--r--  net/ipv4/netfilter/arp_tables.c       4
-rw-r--r--  net/ipv4/netfilter/ip_tables.c        4
-rw-r--r--  net/ipv4/netfilter/nf_log_ipv4.c      2
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c      4
-rw-r--r--  net/ipv4/nexthop.c                    2
-rw-r--r--  net/ipv4/proc.c                       2
-rw-r--r--  net/ipv4/raw.c                        2
-rw-r--r--  net/ipv4/raw_diag.c                  24
-rw-r--r--  net/ipv4/route.c                     61
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c           33
-rw-r--r--  net/ipv4/tcp.c                       29
-rw-r--r--  net/ipv4/tcp_bic.c                   11
-rw-r--r--  net/ipv4/tcp_bpf.c                  272
-rw-r--r--  net/ipv4/tcp_diag.c                   8
-rw-r--r--  net/ipv4/tcp_input.c                  6
-rw-r--r--  net/ipv4/tcp_ipv4.c                  10
-rw-r--r--  net/ipv4/tcp_minisocks.c              9
-rw-r--r--  net/ipv4/tcp_scalable.c              17
-rw-r--r--  net/ipv4/tcp_ulp.c                    9
-rw-r--r--  net/ipv4/tcp_veno.c                  47
-rw-r--r--  net/ipv4/tcp_yeah.c                  41
-rw-r--r--  net/ipv4/udp.c                       24
-rw-r--r--  net/ipv4/udp_bpf.c                   53
-rw-r--r--  net/ipv4/udp_diag.c                  41
-rw-r--r--  net/ipv4/udp_offload.c                1
45 files changed, 705 insertions, 515 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9d97bace13c8..9e1a186a3671 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
+obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2fe295432c24..cf58e29cf746 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -872,7 +872,7 @@ int inet_shutdown(struct socket *sock, int how)
err = -ENOTCONN;
/* Hack to wake up other listeners, who can poll for
EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
- /* fall through */
+ fallthrough;
default:
sk->sk_shutdown |= how;
if (sk->sk_prot->shutdown)
@@ -886,7 +886,7 @@ int inet_shutdown(struct socket *sock, int how)
case TCP_LISTEN:
if (!(how & RCV_SHUTDOWN))
break;
- /* fall through */
+ fallthrough;
case TCP_SYN_SENT:
err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
free_percpu(net->mib.net_statistics);
free_percpu(net->mib.ip_statistics);
free_percpu(net->mib.tcp_statistics);
+#ifdef CONFIG_MPTCP
+ /* allocated on demand, see mptcp_init_sock() */
+ free_percpu(net->mib.mptcp_statistics);
+#endif
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
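
[The "/* fall through */" -> fallthrough; conversions above recur throughout this diff. They replace free-form comments with the kernel's fallthrough pseudo-keyword which, assuming the definition in include/linux/compiler_attributes.h, expands to __attribute__((__fallthrough__)) on compilers that support it, so -Wimplicit-fallthrough can tell deliberate fall-through from a forgotten break. A minimal sketch of the pattern (handle_up()/handle_change() are illustrative):]

	switch (event) {
	case NETDEV_UP:
		handle_up(dev);
		fallthrough;		/* deliberately continue into CHANGE */
	case NETDEV_CHANGE:
		handle_change(dev);
		break;
	default:
		break;
	}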
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 974179b3b314..d99e1be94019 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -107,7 +107,7 @@ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
if (optlen < 6)
return -EINVAL;
memcpy(daddr, optptr+optlen-4, 4);
- /* Fall through */
+ fallthrough;
default:
memset(optptr, 0, optlen);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 05eb42f347e8..687971d83b4e 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1181,7 +1181,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSARP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- /* fall through */
+ fallthrough;
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
if (err)
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 2bf3abeb1456..e3939f76b024 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -7,6 +7,7 @@
#include <linux/btf.h>
#include <linux/filter.h>
#include <net/tcp.h>
+#include <net/bpf_sk_storage.h>
static u32 optional_ops[] = {
offsetof(struct tcp_congestion_ops, init),
@@ -27,6 +28,27 @@ static u32 unsupported_ops[] = {
static const struct btf_type *tcp_sock_type;
static u32 tcp_sock_id, sock_id;
+static int btf_sk_storage_get_ids[5];
+static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly;
+
+static int btf_sk_storage_delete_ids[5];
+static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly;
+
+static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids,
+ const struct bpf_func_proto *from)
+{
+ int i;
+
+ *to = *from;
+ to->btf_id = to_btf_ids;
+ for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) {
+ if (to->arg_type[i] == ARG_PTR_TO_SOCKET) {
+ to->arg_type[i] = ARG_PTR_TO_BTF_ID;
+ to->btf_id[i] = tcp_sock_id;
+ }
+ }
+}
+
static int bpf_tcp_ca_init(struct btf *btf)
{
s32 type_id;
@@ -42,6 +64,13 @@ static int bpf_tcp_ca_init(struct btf *btf)
tcp_sock_id = type_id;
tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+ convert_sk_func_proto(&btf_sk_storage_get_proto,
+ btf_sk_storage_get_ids,
+ &bpf_sk_storage_get_proto);
+ convert_sk_func_proto(&btf_sk_storage_delete_proto,
+ btf_sk_storage_delete_ids,
+ &bpf_sk_storage_delete_proto);
+
return 0;
}
@@ -167,6 +196,10 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
switch (func_id) {
case BPF_FUNC_tcp_send_ack:
return &bpf_tcp_send_ack_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &btf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &btf_sk_storage_delete_proto;
default:
return bpf_base_func_proto(func_id);
}
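
[The bpf_tcp_ca.c hunks above expose bpf_sk_storage_get()/bpf_sk_storage_delete() to BPF TCP congestion-control (struct_ops) programs by retyping the helpers' socket argument from ARG_PTR_TO_SOCKET to a BTF pointer to struct tcp_sock. A sketch of how a congestion-control program might use this, assuming the usual libbpf conventions; map, program, and struct names here are hypothetical:]

	/* Hypothetical per-socket state for a BPF congestion control. */
	struct cc_state {
		__u32 epochs;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_SK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, struct cc_state);
	} cc_stg SEC(".maps");

	SEC("struct_ops/my_cong_avoid")
	void BPF_PROG(my_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
	{
		/* The verifier now accepts the BTF-typed tcp_sock pointer
		 * where these helpers used to demand a socket argument. */
		struct tcp_sock *tp = (struct tcp_sock *)sk;
		struct cc_state *st;

		st = bpf_sk_storage_get(&cc_stg, tp, NULL,
					BPF_SK_STORAGE_GET_F_CREATE);
		if (st)
			st->epochs++;
	}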
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e4632bd2026d..30fa42f5997d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1566,11 +1566,11 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
}
ip_mc_up(in_dev);
- /* fall through */
+ fallthrough;
case NETDEV_CHANGEADDR:
if (!IN_DEV_ARP_NOTIFY(in_dev))
break;
- /* fall through */
+ fallthrough;
case NETDEV_NOTIFY_PEERS:
/* Send gratuitous ARP to notify of link change */
inetdev_send_gratuitous_arp(dev, in_dev);
@@ -1588,7 +1588,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (inetdev_valid_mtu(dev->mtu))
break;
/* disable IP when MTU is not enough */
- /* fall through */
+ fallthrough;
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 103c7d599a3c..8b07f3a4f2db 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -341,22 +341,6 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
-{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
-}
-
static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
int encap_type,
struct esp_info *esp,
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index e2e219c7854a..731022cff600 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -132,6 +132,36 @@ static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
return segs;
}
+static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ int proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else if (x->sel.family != AF_INET6) {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ } else if (proto == IPPROTO_TCP) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
@@ -141,6 +171,8 @@ static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
return xfrm4_tunnel_gso_segment(x, skb, features);
case XFRM_MODE_TRANSPORT:
return xfrm4_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm4_beet_gso_segment(x, skb, features);
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c092e9a55790..818916b2a04d 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -35,7 +35,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack);
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a803cdd9400a..6ed8c9317179 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -570,8 +570,9 @@ static int fib_detect_death(struct fib_info *fi, int order,
return 1;
}
-int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
- u16 encap_type, void *cfg, gfp_t gfp_flags,
+int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
+ struct nlattr *encap, u16 encap_type,
+ void *cfg, gfp_t gfp_flags,
struct netlink_ext_ack *extack)
{
int err;
@@ -589,8 +590,9 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
err = -EINVAL;
goto lwt_failure;
}
- err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family,
- cfg, &lwtstate, extack);
+ err = lwtunnel_build_state(net, encap_type, encap,
+ nhc->nhc_family, cfg, &lwtstate,
+ extack);
if (err)
goto lwt_failure;
@@ -614,7 +616,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
nh->fib_nh_family = AF_INET;
- err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap,
+ err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
if (err)
return err;
@@ -814,7 +816,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int fib_encap_match(u16 encap_type,
+static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
const struct fib_nh *nh,
const struct fib_config *cfg,
@@ -826,7 +828,7 @@ static int fib_encap_match(u16 encap_type,
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- ret = lwtunnel_build_state(encap_type, encap, AF_INET,
+ ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
cfg, &lwtstate, extack);
if (!ret) {
result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
@@ -836,7 +838,7 @@ static int fib_encap_match(u16 encap_type,
return result;
}
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -857,8 +859,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh = fib_info_nh(fi, 0);
if (cfg->fc_encap) {
- if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
- nh, cfg, extack))
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, nh, cfg, extack))
return 1;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1962,7 +1964,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
- /* fall through */
+ fallthrough;
case NETDEV_CHANGE:
nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
break;
@@ -1984,7 +1986,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
fi->fib_flags |= RTNH_F_DEAD;
- /* fall through */
+ fallthrough;
case NETDEV_CHANGE:
fi->fib_flags |= RTNH_F_LINKDOWN;
break;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ff0c24371e33..4f334b425538 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -35,9 +35,6 @@
* Paul E. McKenney <paulmck@us.ibm.com>
* Patrick McHardy <kaber@trash.net>
*/
-
-#define VERSION "0.409"
-
#include <linux/cache.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
@@ -304,8 +301,6 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
call_rcu(&fa->rcu, __alias_free_mem);
}
-#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
#define TNODE_VMALLOC_MAX \
ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
@@ -1684,7 +1679,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi, extack) == 0 &&
+ fib_nh_match(net, cfg, fi, extack) == 0 &&
fib_metrics_match(cfg, fi)) {
fa_to_delete = fa;
break;
@@ -2577,6 +2572,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
" %zd bytes, size of tnode: %zd bytes.\n",
LEAF_SIZE, TNODE_SIZE(0));
+ rcu_read_lock();
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
@@ -2596,7 +2592,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
trie_show_usage(seq, t->stats);
#endif
}
+ cond_resched_rcu();
}
+ rcu_read_unlock();
return 0;
}
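
[The fib_triestat change above wraps the whole table walk in rcu_read_lock() and inserts cond_resched_rcu() between hash buckets, so reading /proc/net/fib_triestat on a machine with huge routing tables no longer pins the CPU inside one long RCU read-side section. The shape of the pattern, roughly as cond_resched_rcu() is defined in linux/sched.h (modulo config guards):]

	rcu_read_lock();
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		/* ... walk one hash bucket ... */
		cond_resched_rcu();	/* ~ rcu_read_unlock(); cond_resched();
					 *   rcu_read_lock(); */
	}
	rcu_read_unlock();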
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f369e7ce685b..fc61f51d87a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -865,7 +865,7 @@ static bool icmp_unreach(struct sk_buff *skb)
case 3:
if (!icmp_tag_validation(iph->protocol))
goto out;
- /* fall through */
+ fallthrough;
case 0:
info = ntohs(icmph->un.frag.mtu);
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 3b9c7a2725a9..47f0502b2101 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,8 +107,6 @@
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
-#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
-#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
#define IGMP_QUERY_INTERVAL (125*HZ)
#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d545fb99a8a1..5f34eb951627 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -131,7 +131,7 @@ static int inet_csk_bind_conflict(const struct sock *sk,
{
struct sock *sk2;
bool reuse = sk->sk_reuse;
- bool reuseport = !!sk->sk_reuseport && reuseport_ok;
+ bool reuseport = !!sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk);
/*
@@ -146,17 +146,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if ((!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- (!reuseport || !sk2->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- (sk2->sk_state != TCP_TIME_WAIT &&
- !uid_eq(uid, sock_i_uid(sk2))))) {
- if (inet_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- if (!relax && reuse && sk2->sk_reuse &&
+ if (reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
+ if ((!relax ||
+ (!reuseport_ok &&
+ reuseport && sk2->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ (sk2->sk_state == TCP_TIME_WAIT ||
+ uid_eq(uid, sock_i_uid(sk2))))) &&
+ inet_rcv_saddr_equal(sk, sk2, true))
+ break;
+ } else if (!reuseport_ok ||
+ !reuseport || !sk2->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
@@ -176,12 +180,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
+ bool relax = false;
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
+ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -219,7 +225,7 @@ other_parity_scan:
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
- if (!inet_csk_bind_conflict(sk, tb, false, false))
+ if (!inet_csk_bind_conflict(sk, tb, relax, false))
goto success;
goto next_port;
}
@@ -239,6 +245,12 @@ next_port:
attempt_half = 2;
goto other_half_scan;
}
+
+ if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
+ /* We still have a chance to connect to different destinations */
+ relax = true;
+ goto ports_exhausted;
+ }
return NULL;
success:
*port_ret = port;
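
[The inet_connection_sock.c rework above gives inet_csk_find_open_port() a second, relaxed pass: when the ephemeral port range is exhausted and the new net.ipv4.ip_autobind_reuse sysctl is enabled, autobind may hand out a local port that is already bound, since sockets connecting to different destinations can still coexist on it. A minimal userspace sketch for enabling the knob (illustrative, error handling trimmed):]

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/net/ipv4/ip_autobind_reuse", "w");

		if (!f)
			return 1;
		fputs("1\n", f);	/* 0 (the default) keeps the old behaviour */
		fclose(f);
		return 0;
	}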
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8c8377568a78..5d50aad3cdbf 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -23,6 +23,7 @@
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/inet6_hashtables.h>
+#include <net/bpf_sk_storage.h>
#include <net/netlink.h>
#include <linux/inet.h>
@@ -170,26 +171,28 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
+#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
- struct sk_buff *skb, const struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh,
- bool net_admin)
+ struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ u16 nlmsg_flags, bool net_admin)
{
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
+ struct inet_diag_dump_data *cb_data;
int ext = req->idiag_ext;
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
struct nlattr *attr;
void *info = NULL;
+ cb_data = cb->data;
handler = inet_diag_table[req->sdiag_protocol];
BUG_ON(!handler);
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
if (!nlh)
return -EMSGSIZE;
@@ -201,7 +204,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -298,6 +303,48 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto errout;
}
+ /* Keep it at the end for potential retry with a larger skb,
+ * or else do best-effort fitting, which is only done for the
+ * first_nlmsg.
+ */
+ if (cb_data->bpf_stg_diag) {
+ bool first_nlmsg = ((unsigned char *)nlh == skb->data);
+ unsigned int prev_min_dump_alloc;
+ unsigned int total_nla_size = 0;
+ unsigned int msg_len;
+ int err;
+
+ msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
+ err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb,
+ INET_DIAG_SK_BPF_STORAGES,
+ &total_nla_size);
+
+ if (!err)
+ goto out;
+
+ total_nla_size += msg_len;
+ prev_min_dump_alloc = cb->min_dump_alloc;
+ if (total_nla_size > prev_min_dump_alloc)
+ cb->min_dump_alloc = min_t(u32, total_nla_size,
+ MAX_DUMP_ALLOC_SIZE);
+
+ if (!first_nlmsg)
+ goto errout;
+
+ if (cb->min_dump_alloc > prev_min_dump_alloc)
+ /* Retry with pskb_expand_head() with
+ * __GFP_DIRECT_RECLAIM
+ */
+ goto errout;
+
+ WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc);
+
+ /* Send what we have for this sk
+ * and move on to the next sk in the following
+ * dump()
+ */
+ }
+
out:
nlmsg_end(skb, nlh);
return 0;
@@ -308,30 +355,19 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
-static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- const struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh,
- bool net_admin)
-{
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
- portid, seq, nlmsg_flags, unlh, net_admin);
-}
-
static int inet_twsk_diag_fill(struct sock *sk,
struct sk_buff *skb,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ struct netlink_callback *cb,
+ u16 nlmsg_flags)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+ sizeof(*r), nlmsg_flags);
if (!nlh)
return -EMSGSIZE;
@@ -355,16 +391,16 @@ static int inet_twsk_diag_fill(struct sock *sk,
}
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh, bool net_admin)
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
{
struct request_sock *reqsk = inet_reqsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
if (!nlh)
return -EMSGSIZE;
@@ -393,21 +429,18 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh, bool net_admin)
+ u16 nlmsg_flags, bool net_admin)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh);
+ return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return inet_req_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh, net_admin);
+ return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
- return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh, net_admin);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
+ net_admin);
}
struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -455,10 +488,10 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+ struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
+ struct sk_buff *in_skb = cb->skb;
bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
struct net *net = sock_net(in_skb->sk);
struct sk_buff *rep;
@@ -475,10 +508,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
goto out;
}
- err = sk_diag_fill(sk, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh, net_admin);
+ err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
@@ -505,14 +535,21 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
int err;
handler = inet_diag_lock_handler(req->sdiag_protocol);
- if (IS_ERR(handler))
+ if (IS_ERR(handler)) {
err = PTR_ERR(handler);
- else if (cmd == SOCK_DIAG_BY_FAMILY)
- err = handler->dump_one(in_skb, nlh, req);
- else if (cmd == SOCK_DESTROY && handler->destroy)
+ } else if (cmd == SOCK_DIAG_BY_FAMILY) {
+ struct inet_diag_dump_data empty_dump_data = {};
+ struct netlink_callback cb = {
+ .nlh = nlh,
+ .skb = in_skb,
+ .data = &empty_dump_data,
+ };
+ err = handler->dump_one(&cb, req);
+ } else if (cmd == SOCK_DESTROY && handler->destroy) {
err = handler->destroy(in_skb, req);
- else
+ } else {
err = -EOPNOTSUPP;
+ }
inet_diag_unlock_handler(handler);
return err;
@@ -843,23 +880,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return len == 0 ? 0 : -EINVAL;
}
-static int inet_csk_diag_dump(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- const struct nlattr *bc,
- bool net_admin)
-{
- if (!inet_diag_bc_sk(bc, sk))
- return 0;
-
- return inet_csk_diag_fill(sk, skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
- net_admin);
-}
-
static void twsk_build_assert(void)
{
BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
@@ -888,14 +908,17 @@ static void twsk_build_assert(void)
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct inet_diag_dump_data *cb_data = cb->data;
struct net *net = sock_net(skb->sk);
u32 idiag_states = r->idiag_states;
int i, num, s_i, s_num;
+ struct nlattr *bc;
struct sock *sk;
+ bc = cb_data->inet_diag_nla_bc;
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1];
@@ -931,8 +954,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (inet_csk_diag_dump(sk, skb, cb, r,
- bc, net_admin) < 0) {
+ if (!inet_diag_bc_sk(bc, sk))
+ goto next_listen;
+
+ if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+ cb, r, NLM_F_MULTI,
+ net_admin) < 0) {
spin_unlock(&ilb->lock);
goto done;
}
@@ -1010,11 +1037,8 @@ next_normal:
res = 0;
for (idx = 0; idx < accum; idx++) {
if (res >= 0) {
- res = sk_diag_fill(sk_arr[idx], skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh, net_admin);
+ res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+ NLM_F_MULTI, net_admin);
if (res < 0)
num = num_arr[idx];
}
@@ -1038,31 +1062,101 @@ out:
EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
const struct inet_diag_handler *handler;
+ u32 prev_min_dump_alloc;
int err = 0;
+again:
+ prev_min_dump_alloc = cb->min_dump_alloc;
handler = inet_diag_lock_handler(r->sdiag_protocol);
if (!IS_ERR(handler))
- handler->dump(skb, cb, r, bc);
+ handler->dump(skb, cb, r);
else
err = PTR_ERR(handler);
inet_diag_unlock_handler(handler);
+ /* The skb is not large enough to fit one sk info and
+ * inet_sk_diag_fill() has requested for a larger skb.
+ */
+ if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) {
+ err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL);
+ if (!err)
+ goto again;
+ }
+
return err ? : skb->len;
}
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- int hdrlen = sizeof(struct inet_diag_req_v2);
- struct nlattr *bc = NULL;
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh));
+}
- if (nlmsg_attrlen(cb->nlh, hdrlen))
- bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_diag_dump_data *cb_data;
+ struct sk_buff *skb = cb->skb;
+ struct nlattr *nla;
+ int rem, err;
+
+ cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL);
+ if (!cb_data)
+ return -ENOMEM;
+
+ nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen),
+ nlmsg_attrlen(nlh, hdrlen), rem) {
+ int type = nla_type(nla);
+
+ if (type < __INET_DIAG_REQ_MAX)
+ cb_data->req_nlas[type] = nla;
+ }
+
+ nla = cb_data->inet_diag_nla_bc;
+ if (nla) {
+ err = inet_diag_bc_audit(nla, skb);
+ if (err) {
+ kfree(cb_data);
+ return err;
+ }
+ }
+
+ nla = cb_data->inet_diag_nla_bpf_stgs;
+ if (nla) {
+ struct bpf_sk_storage_diag *bpf_stg_diag;
+
+ bpf_stg_diag = bpf_sk_storage_diag_alloc(nla);
+ if (IS_ERR(bpf_stg_diag)) {
+ kfree(cb_data);
+ return PTR_ERR(bpf_stg_diag);
+ }
+ cb_data->bpf_stg_diag = bpf_stg_diag;
+ }
+
+ cb->data = cb_data;
+ return 0;
+}
+
+static int inet_diag_dump_start(struct netlink_callback *cb)
+{
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2));
+}
+
+static int inet_diag_dump_start_compat(struct netlink_callback *cb)
+{
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req));
+}
- return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+ struct inet_diag_dump_data *cb_data = cb->data;
+
+ bpf_sk_storage_diag_free(cb_data->bpf_stg_diag);
+ kfree(cb->data);
+
+ return 0;
}
static int inet_diag_type2proto(int type)
@@ -1081,9 +1175,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct inet_diag_req *rc = nlmsg_data(cb->nlh);
- int hdrlen = sizeof(struct inet_diag_req);
struct inet_diag_req_v2 req;
- struct nlattr *bc = NULL;
req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
@@ -1091,10 +1183,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
req.idiag_states = rc->idiag_states;
req.id = rc->id;
- if (nlmsg_attrlen(cb->nlh, hdrlen))
- bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
-
- return __inet_diag_dump(skb, cb, &req, bc);
+ return __inet_diag_dump(skb, cb, &req);
}
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
@@ -1122,22 +1211,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- if (nlmsg_attrlen(nlh, hdrlen)) {
- struct nlattr *attr;
- int err;
-
- attr = nlmsg_find_attr(nlh, hdrlen,
- INET_DIAG_REQ_BYTECODE);
- err = inet_diag_bc_audit(attr, skb);
- if (err)
- return err;
- }
- {
- struct netlink_dump_control c = {
- .dump = inet_diag_dump_compat,
- };
- return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
- }
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start_compat,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
}
return inet_diag_get_exact_compat(skb, nlh);
@@ -1153,22 +1232,12 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
h->nlmsg_flags & NLM_F_DUMP) {
- if (nlmsg_attrlen(h, hdrlen)) {
- struct nlattr *attr;
- int err;
-
- attr = nlmsg_find_attr(h, hdrlen,
- INET_DIAG_REQ_BYTECODE);
- err = inet_diag_bc_audit(attr, skb);
- if (err)
- return err;
- }
- {
- struct netlink_dump_control c = {
- .dump = inet_diag_dump,
- };
- return netlink_dump_start(net->diag_nlsk, skb, h, &c);
- }
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
}
return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
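
[The inet_diag.c conversion above moves request parsing into the netlink .start/.done callbacks: __inet_diag_dump_start() scans the request attributes once, stashes them in a kzalloc'd cb->data, and optionally allocates a BPF sk_storage dumper; inet_diag_dump_done() frees both. The cb->data layout implied by the accessors used above is roughly the following — field names are taken from this diff, but the exact header definition may differ:]

	struct inet_diag_dump_data {
		struct nlattr *req_nlas[__INET_DIAG_REQ_MAX];
	#define inet_diag_nla_bc	req_nlas[INET_DIAG_REQ_BYTECODE]
	#define inet_diag_nla_bpf_stgs	req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES]

		struct bpf_sk_storage_diag *bpf_stg_diag;
	};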
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index aa438c6758a7..b0c244af1e4d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
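
[The ip_input.c hunk preserves skb->sk when it was attached ahead of time: this merge window adds a bpf_sk_assign() helper that lets a TC BPF program steer a packet to a chosen (e.g. listening) socket, TPROXY-style, and skb_sk_is_prefetched() detects such skbs so ip_rcv_core() does not orphan them. A hedged sketch of the BPF side — tuple setup is elided, and the section/function names are illustrative:]

	SEC("tc")
	int steer_to_proxy(struct __sk_buff *skb)
	{
		struct bpf_sock_tuple tuple = {};
		struct bpf_sock *sk;

		/* ... fill tuple.ipv4 with the proxy's address/port ... */
		sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (!sk)
			return TC_ACT_OK;
		bpf_sk_assign(skb, sk, 0);	/* prefetch sk onto the skb */
		bpf_sk_release(sk);
		return TC_ACT_OK;
	}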
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d84819893db9..090d3097ee15 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
* insufficent MTU.
*/
features = netif_skb_features(skb);
- BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -333,7 +333,7 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk,
switch (ret) {
case NET_XMIT_CN:
do_cn = true;
- /* fall through */
+ fallthrough;
case NET_XMIT_SUCCESS:
break;
default:
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 74e1d964a615..cd4b84310d92 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -142,11 +142,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
cand = t;
}
- if (flags & TUNNEL_NO_KEY)
- goto skip_key_lookup;
-
hlist_for_each_entry_rcu(t, head, hash_node) {
- if (t->parms.i_key != key ||
+ if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
t->parms.iph.saddr != 0 ||
t->parms.iph.daddr != 0 ||
!(t->dev->flags & IFF_UP))
@@ -158,7 +155,6 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
cand = t;
}
-skip_key_lookup:
if (cand)
return cand;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 47f8b947eef1..181b7a2a0247 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -432,7 +432,7 @@ static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
return ip_tun_parse_opts(attr, info, extack);
}
-static int ip_tun_build_state(struct nlattr *attr,
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
@@ -719,7 +719,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
[LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
};
-static int ip6_tun_build_state(struct nlattr *attr,
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 6e68def66822..9cf83cc85e4a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1465,7 +1465,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
case MRT_ADD_MFC:
case MRT_DEL_MFC:
parent = -1;
- /* fall through */
+ fallthrough;
case MRT_ADD_MFC_PROXY:
case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc)) {
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f1f78a742b36..b167f4a5b684 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1057,7 +1057,7 @@ struct compat_arpt_replace {
u32 underflow[NF_ARP_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters;
- struct compat_arpt_entry entries[0];
+ struct compat_arpt_entry entries[];
};
static inline void compat_release_entry(struct compat_arpt_entry *e)
@@ -1383,7 +1383,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
struct compat_arpt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_arpt_entry entrytable[0];
+ struct compat_arpt_entry entrytable[];
};
static int compat_get_entries(struct net *net,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 10b91ebdf213..c2670eaa74e6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1211,7 +1211,7 @@ struct compat_ipt_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ipt_entry entries[0];
+ struct compat_ipt_entry entries[];
};
static int
@@ -1562,7 +1562,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
struct compat_ipt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ipt_entry entrytable[0];
+ struct compat_ipt_entry entrytable[];
};
static int
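
[The entries[0] -> entries[] changes in arp_tables.c and ip_tables.c are part of the tree-wide zero-length-array cleanup called out in highlight 6: a C99 flexible array member, unlike the old GNU [0] extension, is standard C and lets the compiler and sanitizers reason about the trailing array's bounds. The general pattern, with an illustrative struct:]

	struct hdr {
		unsigned int len;
		unsigned char data[];	/* was: data[0] */
	};

	/* sizing allocations via struct_size() avoids hand-rolled overflow: */
	struct hdr *h = kmalloc(struct_size(h, data, n), GFP_KERNEL);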
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 4b2d49cc9f1a..0c72156130b6 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -173,7 +173,7 @@ static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
case ICMP_REDIRECT:
/* Max length: 24 "GATEWAY=255.255.255.255 " */
nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
- /* Fall through */
+ fallthrough;
case ICMP_DEST_UNREACH:
case ICMP_SOURCE_QUENCH:
case ICMP_TIME_EXCEEDED:
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index b2aeb7bf5dac..3c25a467b3ef 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -168,7 +168,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
pptp_msg_name[0]);
- /* fall through */
+ fallthrough;
case PPTP_SET_LINK_INFO:
/* only need to NAT in case PAC is behind NAT box */
case PPTP_START_SESSION_REQUEST:
@@ -271,7 +271,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
pr_debug("unknown inbound packet %s\n",
msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
pptp_msg_name[0]);
- /* fall through */
+ fallthrough;
case PPTP_START_SESSION_REQUEST:
case PPTP_START_SESSION_REPLY:
case PPTP_STOP_SESSION_REQUEST:
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index d072c326dd64..fdfca534d094 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1327,7 +1327,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
case AF_UNSPEC:
if (tb[NHA_GROUP])
break;
- /* fallthrough */
+ fallthrough;
default:
NL_SET_ERR_MSG(extack, "Invalid address family");
goto out;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2580303249e2..75545a829a2b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -32,6 +32,7 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <linux/bottom_half.h>
@@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
offsetof(struct ipstats_mib, syncp)));
seq_putc(seq, '\n');
+ mptcp_seq_show(seq);
return 0;
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 3183413ebc6c..47665919048f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1034,6 +1034,7 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
}
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&h->lock)
{
struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
@@ -1056,6 +1057,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(raw_seq_next);
void raw_seq_stop(struct seq_file *seq, void *v)
+ __releases(&h->lock)
{
struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index a93e7d1e1251..1b5b8af27aaf 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -87,15 +87,16 @@ out_unlock:
return sk ? sk : ERR_PTR(-ENOENT);
}
-static int raw_diag_dump_one(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int raw_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *in_skb = cb->skb;
struct sk_buff *rep;
struct sock *sk;
+ struct net *net;
int err;
+ net = sock_net(in_skb->sk);
sk = raw_sock_get(net, r);
if (IS_ERR(sk))
return PTR_ERR(sk);
@@ -109,10 +110,7 @@ static int raw_diag_dump_one(struct sk_buff *in_skb,
return -ENOMEM;
}
- err = inet_sk_diag_fill(sk, NULL, rep, r,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh,
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0,
netlink_net_capable(in_skb, CAP_NET_ADMIN));
sock_put(sk);
@@ -137,25 +135,25 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
if (!inet_diag_bc_sk(bc, sk))
return 0;
- return inet_sk_diag_fill(sk, NULL, skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh, net_admin);
+ return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
}
static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
+ struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
struct sock *sk = NULL;
+ struct nlattr *bc;
if (IS_ERR(hashinfo))
return;
+ cb_data = cb->data;
+ bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ebe7060d0fc9..788c69d9bfe0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1621,12 +1621,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
struct rtable *rt_dst_alloc(struct net_device *dev,
unsigned int flags, u16 type,
- bool nopolicy, bool noxfrm, bool will_cache)
+ bool nopolicy, bool noxfrm)
{
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
@@ -1674,7 +1673,6 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
new_rt->rt_gw6 = rt->rt_gw6;
INIT_LIST_HEAD(&new_rt->rt_uncached);
- new_rt->dst.flags |= DST_HOST;
new_rt->dst.input = rt->dst.input;
new_rt->dst.output = rt->dst.output;
new_rt->dst.error = rt->dst.error;
@@ -1734,7 +1732,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
flags |= RTCF_LOCAL;
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
return -ENOBUFS;
@@ -1851,7 +1849,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth = rt_dst_alloc(out_dev->dev, 0, res->type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
+ IN_DEV_CONF_GET(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
@@ -2219,7 +2217,7 @@ local_input:
rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
flags | RTCF_LOCAL, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
@@ -2443,8 +2441,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
add:
rth = rt_dst_alloc(dev_out, flags, type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(in_dev, NOXFRM),
- do_cache);
+ IN_DEV_CONF_GET(in_dev, NOXFRM));
if (!rth)
return ERR_PTR(-ENOBUFS);
@@ -2774,6 +2771,54 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
+struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net, __be32 *saddr,
+ const struct ip_tunnel_info *info,
+ u8 protocol, bool use_cache)
+{
+#ifdef CONFIG_DST_CACHE
+ struct dst_cache *dst_cache;
+#endif
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ __u8 tos;
+
+#ifdef CONFIG_DST_CACHE
+ dst_cache = (struct dst_cache *)&info->dst_cache;
+ if (use_cache) {
+ rt = dst_cache_get_ip4(dst_cache, saddr);
+ if (rt)
+ return rt;
+ }
+#endif
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_proto = protocol;
+ fl4.daddr = info->key.u.ipv4.dst;
+ fl4.saddr = info->key.u.ipv4.src;
+ tos = info->key.tos;
+ fl4.flowi4_tos = RT_TOS(tos);
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (rt->dst.dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
+ ip_rt_put(rt);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (use_cache)
+ dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+#endif
+ *saddr = fl4.saddr;
+ return rt;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
+
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
struct rtable *rt, u32 table_id, struct flowi4 *fl4,
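
[route.c gains ip_route_output_tunnel(), an exported helper that routes the outer IPv4 header for metadata-based tunnels, with optional dst_cache reuse. A hedged caller sketch against the signature shown above; the driver and function names are hypothetical:]

	static int toy_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct ip_tunnel_info *info = skb_tunnel_info(skb);
		bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
		struct rtable *rt;
		__be32 saddr;

		rt = ip_route_output_tunnel(skb, dev, dev_net(dev), &saddr,
					    info, IPPROTO_UDP, use_cache);
		if (IS_ERR(rt))
			return PTR_ERR(rt);

		/* ... push the outer header using saddr, send via &rt->dst ... */
		return 0;
	}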
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9684af02e0a5..81b267e990a1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -555,18 +555,6 @@ static struct ctl_table ipv4_table[] = {
},
#endif /* CONFIG_NETLABEL */
{
- .procname = "tcp_available_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
- .mode = 0444,
- .proc_handler = proc_tcp_available_congestion_control,
- },
- {
- .procname = "tcp_allowed_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
- .mode = 0644,
- .proc_handler = proc_allowed_congestion_control,
- },
- {
.procname = "tcp_available_ulp",
.maxlen = TCP_ULP_BUF_MAX,
.mode = 0444,
@@ -776,6 +764,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_autobind_reuse",
+ .data = &init_net.ipv4.sysctl_ip_autobind_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "fwmark_reflect",
.data = &init_net.ipv4.sysctl_fwmark_reflect,
.maxlen = sizeof(int),
@@ -886,6 +883,18 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_tcp_congestion_control,
},
{
+ .procname = "tcp_available_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_congestion_control,
+ },
+ {
+ .procname = "tcp_allowed_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0644,
+ .proc_handler = proc_allowed_congestion_control,
+ },
+ {
.procname = "tcp_keepalive_time",
.data = &init_net.ipv4.sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dc77c303e6f7..6d87de434377 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2251,7 +2251,7 @@ void tcp_set_state(struct sock *sk, int state)
if (inet_csk(sk)->icsk_bind_hash &&
!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
inet_put_port(sk);
- /* fall through */
+ fallthrough;
default:
if (oldstate == TCP_ESTABLISHED)
TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
@@ -3346,6 +3346,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
0;
}
@@ -3401,6 +3402,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+ nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
+ max_t(int, 0, tp->write_seq - tp->snd_nxt));
return stats;
}
@@ -3669,13 +3672,35 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
if (get_user(len, optlen))
return -EFAULT;
- if (len != sizeof(zc))
+ if (len < offsetofend(struct tcp_zerocopy_receive, length))
return -EINVAL;
+ if (len > sizeof(zc)) {
+ len = sizeof(zc);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ }
if (copy_from_user(&zc, optval, len))
return -EFAULT;
lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc);
release_sock(sk);
+ if (len == sizeof(zc))
+ goto zerocopy_rcv_sk_err;
+ switch (len) {
+ case offsetofend(struct tcp_zerocopy_receive, err):
+ goto zerocopy_rcv_sk_err;
+ case offsetofend(struct tcp_zerocopy_receive, inq):
+ goto zerocopy_rcv_inq;
+ case offsetofend(struct tcp_zerocopy_receive, length):
+ default:
+ goto zerocopy_rcv_out;
+ }
+zerocopy_rcv_sk_err:
+ if (!err)
+ zc.err = sock_error(sk);
+zerocopy_rcv_inq:
+ zc.inq = tcp_inq_hint(sk);
+zerocopy_rcv_out:
if (!err && copy_to_user(optval, &zc, len))
err = -EFAULT;
return err;
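
[The do_tcp_getsockopt() change above makes TCP_ZEROCOPY_RECEIVE forward- and backward-compatible: any optlen covering at least the length field is accepted, and the newer inq and err fields are filled only when the caller's length reaches them (the offsetofend() switch picks the matching case). A userspace sketch, assuming the uapi struct of this kernel; error handling trimmed:]

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/tcp.h>		/* struct tcp_zerocopy_receive */

	static long zc_receive(int fd, void *map_addr, __u32 want)
	{
		struct tcp_zerocopy_receive zc = {
			.address = (__u64)(unsigned long)map_addr,
			.length	 = want,
		};
		socklen_t len = sizeof(zc);	/* the kernel may shrink this */

		if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &len))
			return -1;
		/* zc.inq / zc.err are valid only if len still covers them */
		return zc.length;
	}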
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 645cc3009e64..f5f588b1f6e9 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -145,12 +145,13 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else {
- bictcp_update(ca, tp->snd_cwnd);
- tcp_cong_avoid_ai(tp, ca->cnt, 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
+ bictcp_update(ca, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
/*
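
[The tcp_bic.c hunk is one of the stretch-ACK fixes from highlight 19: when a single ACK covers many segments (common with GRO/LRO on the receiver), tcp_slow_start() consumes only the credits needed to reach ssthresh and returns the remainder, which must then feed additive increase instead of the old hard-coded 1. A worked example, assuming tcp_slow_start()'s clamping semantics:]

	/* snd_cwnd = 28, snd_ssthresh = 32, stretch ACK with acked = 10 */
	acked = tcp_slow_start(tp, acked);	/* cwnd: 28 -> 32, returns 10 - 4 = 6 */
	if (!acked)
		return;				/* fully consumed by slow start */
	bictcp_update(ca, tp->snd_cwnd);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);	/* credit all 6, not just 1 */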
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 8a01428f80c1..5a05327f97c1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,38 +10,6 @@
#include <net/inet_common.h>
#include <net/tls.h>
-static bool tcp_bpf_stream_read(const struct sock *sk)
-{
- struct sk_psock *psock;
- bool empty = true;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock))
- empty = list_empty(&psock->ingress_msg);
- rcu_read_unlock();
- return !empty;
-}
-
-static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
- int flags, long timeo, int *err)
-{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- int ret = 0;
-
- if (!timeo)
- return ret;
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- ret = sk_wait_event(sk, &timeo,
- !list_empty(&psock->ingress_msg) ||
- !skb_queue_empty(&sk->sk_receive_queue), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- return ret;
-}
-
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
struct msghdr *msg, int len, int flags)
{
@@ -115,49 +83,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
}
EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
-int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
-{
- struct sk_psock *psock;
- int copied, ret;
-
- psock = sk_psock_get(sk);
- if (unlikely(!psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
- if (!skb_queue_empty(&sk->sk_receive_queue) &&
- sk_psock_queue_empty(psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- lock_sock(sk);
-msg_bytes_ready:
- copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
- if (!copied) {
- int data, err = 0;
- long timeo;
-
- timeo = sock_rcvtimeo(sk, nonblock);
- data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
- if (data) {
- if (!sk_psock_queue_empty(psock))
- goto msg_bytes_ready;
- release_sock(sk);
- sk_psock_put(sk, psock);
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- }
- if (err) {
- ret = err;
- goto out;
- }
- copied = -EAGAIN;
- }
- ret = copied;
-out:
- release_sock(sk);
- sk_psock_put(sk, psock);
- return ret;
-}
-
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
@@ -298,6 +223,82 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
}
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+#ifdef CONFIG_BPF_STREAM_PARSER
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+ int flags, long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
+ sk_psock_queue_empty(psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ int data, err = 0;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+ if (data) {
+ if (!sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ }
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, int *copied, int flags)
{
@@ -528,57 +529,6 @@ out_err:
return copied ? copied : err;
}
-static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_link *link;
-
- while ((link = sk_psock_link_pop(psock))) {
- sk_psock_unlink(sk, link);
- sk_psock_free_link(link);
- }
-}
-
-static void tcp_bpf_unhash(struct sock *sk)
-{
- void (*saved_unhash)(struct sock *sk);
- struct sk_psock *psock;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- if (sk->sk_prot->unhash)
- sk->sk_prot->unhash(sk);
- return;
- }
-
- saved_unhash = psock->saved_unhash;
- tcp_bpf_remove(sk, psock);
- rcu_read_unlock();
- saved_unhash(sk);
-}
-
-static void tcp_bpf_close(struct sock *sk, long timeout)
-{
- void (*saved_close)(struct sock *sk, long timeout);
- struct sk_psock *psock;
-
- lock_sock(sk);
- rcu_read_lock();
- psock = sk_psock(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- release_sock(sk);
- return sk->sk_prot->close(sk, timeout);
- }
-
- saved_close = psock->saved_close;
- tcp_bpf_remove(sk, psock);
- rcu_read_unlock();
- release_sock(sk);
- saved_close(sk, timeout);
-}
-
enum {
TCP_BPF_IPV4,
TCP_BPF_IPV6,
@@ -599,8 +549,8 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
struct proto *base)
{
prot[TCP_BPF_BASE] = *base;
- prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
- prot[TCP_BPF_BASE].close = tcp_bpf_close;
+ prot[TCP_BPF_BASE].unhash = sock_map_unhash;
+ prot[TCP_BPF_BASE].close = sock_map_close;
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
@@ -629,26 +579,6 @@ static int __init tcp_bpf_v4_build_proto(void)
}
core_initcall(tcp_bpf_v4_build_proto);
-static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
-{
- int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
- int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
-
- sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
-}
-
-static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
-{
- int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
- int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
-
- /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
- * or added requiring sk_prot hook updates. We keep original saved
- * hooks in this case.
- */
- sk->sk_prot = &tcp_bpf_prots[family][config];
-}
-
static int tcp_bpf_assert_proto_ops(struct proto *ops)
{
/* In order to avoid retpoline, we make assumptions when we call
@@ -660,34 +590,34 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
}
-void tcp_bpf_reinit(struct sock *sk)
+struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock *psock;
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
- sock_owned_by_me(sk);
+ if (!psock->sk_proto) {
+ struct proto *ops = READ_ONCE(sk->sk_prot);
- rcu_read_lock();
- psock = sk_psock(sk);
- tcp_bpf_reinit_sk_prot(sk, psock);
- rcu_read_unlock();
+ if (tcp_bpf_assert_proto_ops(ops))
+ return ERR_PTR(-EINVAL);
+
+ tcp_bpf_check_v6_needs_rebuild(sk, ops);
+ }
+
+ return &tcp_bpf_prots[family][config];
}
-int tcp_bpf_init(struct sock *sk)
+/* If a child got cloned from a listening socket that had tcp_bpf
+ * protocol callbacks installed, we need to restore the callbacks to
+ * the default ones because the child does not inherit the psock state
+ * that tcp_bpf callbacks expect.
+ */
+void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
{
- struct proto *ops = READ_ONCE(sk->sk_prot);
- struct sk_psock *psock;
-
- sock_owned_by_me(sk);
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ struct proto *prot = newsk->sk_prot;
- rcu_read_lock();
- psock = sk_psock(sk);
- if (unlikely(!psock || psock->sk_proto ||
- tcp_bpf_assert_proto_ops(ops))) {
- rcu_read_unlock();
- return -EINVAL;
- }
- tcp_bpf_check_v6_needs_rebuild(sk, ops);
- tcp_bpf_update_sk_prot(sk, psock);
- rcu_read_unlock();
- return 0;
+ if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE])
+ newsk->sk_prot = sk->sk_prot_creator;
}
+#endif /* CONFIG_BPF_STREAM_PARSER */
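
The net effect of the tcp_bpf.c changes: the psock-specific unhash/close paths move into the generic sock_map code, and the sockmap core now asks for a proto table via tcp_bpf_get_proto() instead of tcp_bpf_init()/tcp_bpf_reinit() mutating sk->sk_prot directly. tcp_bpf_clone() only has to compare against the TCP_BPF_BASE slot because a listening socket runs with the base callback set, and sk_prot_creator still points at the proto the listener was created with. A standalone sketch of the restore-on-clone idea (illustrative names, not kernel API):

	struct ops { int id; };

	struct sock_lite {
		const struct ops *prot;		/* active callbacks */
		const struct ops *prot_creator;	/* callbacks at create time */
	};

	static const struct ops bpf_base;	/* stands in for tcp_bpf_prots[f][TCP_BPF_BASE] */

	static void clone_fixup(const struct sock_lite *parent,
				struct sock_lite *child)
	{
		/* the child copied parent->prot wholesale; undo that if the
		 * parent was instrumented, since the child has no psock */
		if (child->prot == &bpf_base)
			child->prot = parent->prot_creator;
	}
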
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 0d08f9e2d8d0..75a1c985f49a 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -179,15 +179,15 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
+ inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r);
}
-static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
+ return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req);
}
#ifdef CONFIG_INET_DIAG_DESTROY
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6b6b57000dad..bf4ced9273e8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2865,7 +2865,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
(*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
- /* fall through */
+ fallthrough;
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -6367,7 +6367,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
mptcp_incoming_options(sk, skb, &tp->rx_opt);
break;
}
- /* fall through */
+ fallthrough;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
@@ -6382,7 +6382,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 1;
}
}
- /* Fall through */
+ fallthrough;
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
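
The /* fall through */ comment conversions here (and in tcp_ipv4.c and udp.c below) replace free-form comments with the fallthrough pseudo-keyword, which -Wimplicit-fallthrough can actually verify; the kernel's definition lives in include/linux/compiler_attributes.h. A standalone sketch of the same idea, assuming a GCC/Clang toolchain:

	#include <stdio.h>

	#if defined(__has_attribute)
	# if __has_attribute(__fallthrough__)
	#  define fallthrough	__attribute__((__fallthrough__))
	# endif
	#endif
	#ifndef fallthrough
	# define fallthrough	do {} while (0)	/* fallthrough */
	#endif

	static const char *classify(int state)
	{
		switch (state) {
		case 1:
			puts("draining");
			fallthrough;	/* deliberate: checked by the compiler */
		case 2:
			return "queueing states";
		default:
			return "other";
		}
	}

	int main(void)
	{
		puts(classify(1));
		return 0;
	}
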
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index df1166b76126..83a5d24e13b8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1019,7 +1019,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
if (!md5sig)
return NULL;
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
if (key->l3index && key->l3index != l3index)
@@ -1064,7 +1065,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
if (family == AF_INET6)
size = sizeof(struct in6_addr);
#endif
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
if (key->l3index && key->l3index != l3index)
@@ -2070,7 +2072,7 @@ do_time_wait:
}
}
/* to ACK */
- /* fall through */
+ fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
@@ -2366,7 +2368,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
break;
st->bucket = 0;
st->state = TCP_SEQ_STATE_ESTABLISHED;
- /* Fallthrough */
+ fallthrough;
case TCP_SEQ_STATE_ESTABLISHED:
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
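
The extra argument to hlist_for_each_entry_rcu() is the list-RCU lockdep update: with CONFIG_PROVE_RCU_LIST, the iterator checks that the caller is either in an RCU read-side critical section or that the supplied lockdep expression holds. The MD5 key list is written under the socket lock, so both walks below are legal without false positives; tcp_ulp.c gets the same treatment with lockdep_is_held(&tcp_ulp_list_lock). A kernel-style sketch, not a standalone program (use()/update() are placeholders):

	rcu_read_lock();			/* reader path */
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk))
		use(key);
	rcu_read_unlock();

	lock_sock(sk);				/* update path, no rcu_read_lock() */
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk))
		update(key);
	release_sock(sk);
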
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ad3b56d9fa71..7e40322cc5ec 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -548,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+ tcp_bpf_clone(sk, newsk);
+
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
return newsk;
@@ -772,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
+ if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ return child;
+ }
+
sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req);
*req_stolen = !own_req;
@@ -817,6 +825,7 @@ EXPORT_SYMBOL(tcp_check_req);
int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb)
+ __releases(&((child)->sk_lock.slock))
{
int ret = 0;
int state = child->sk_state;
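
The __releases() annotation added to tcp_child_process() is for sparse's lock-context checking: the function is entered with the child socket bh-locked and exits after dropping that lock, and without the annotation sparse flags the unbalanced context. Roughly how the annotations expand (cf. include/linux/compiler_types.h):

	struct lock;

	#ifdef __CHECKER__			/* only sparse defines this */
	# define __acquires(x)	__attribute__((context(x, 0, 1)))
	# define __releases(x)	__attribute__((context(x, 1, 0)))
	#else
	# define __acquires(x)
	# define __releases(x)
	#endif

	void take(struct lock *l) __acquires(l);	/* exits holding l */
	void drop(struct lock *l) __releases(l);	/* entered holding l */
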
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 471571e1ab26..6cebf412d590 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -10,10 +10,9 @@
#include <net/tcp.h>
/* These factors derived from the recommended values in the aer:
- * .01 and and 7/8. We use 50 instead of 100 to account for
- * delayed ack.
+ * .01 and 7/8.
*/
-#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_AI_CNT 100U
#define TCP_SCALABLE_MD_SCALE 3
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -23,11 +22,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else
- tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
- 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ acked);
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
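
This hunk and the matching tcp_veno.c/tcp_yeah.c hunks below are one pattern: tcp_slow_start() returns how many acked segments are left over once cwnd reaches ssthresh, and that remainder is credited to additive increase instead of a hardcoded 1, so a stretch ACK covering N segments advances cwnd the same way N separate ACKs would. It also retires Scalable's halved AI_CNT, which only existed to compensate for delayed ACKs. For reference, tcp_cong_avoid_ai() (net/ipv4/tcp_cong.c as of this merge, abridged) accumulates the credit:

	void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
	{
		/* If credits accumulated at a higher w, apply them gently now. */
		if (tp->snd_cwnd_cnt >= w) {
			tp->snd_cwnd_cnt = 0;
			tp->snd_cwnd++;
		}

		tp->snd_cwnd_cnt += acked;
		if (tp->snd_cwnd_cnt >= w) {
			u32 delta = tp->snd_cwnd_cnt / w;

			tp->snd_cwnd_cnt -= delta * w;
			tp->snd_cwnd += delta;
		}
		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
	}
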
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 38d3ad141161..7c27aa629af1 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -22,7 +22,8 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
{
struct tcp_ulp_ops *e;
- list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
+ list_for_each_entry_rcu(e, &tcp_ulp_list, list,
+ lockdep_is_held(&tcp_ulp_list_lock)) {
if (strcmp(e->name, name) == 0)
return e;
}
@@ -104,12 +105,6 @@ void tcp_update_ulp(struct sock *sk, struct proto *proto,
{
struct inet_connection_sock *icsk = inet_csk(sk);
- if (!icsk->icsk_ulp_ops) {
- sk->sk_write_space = write_space;
- sk->sk_prot = proto;
- return;
- }
-
if (icsk->icsk_ulp_ops->update)
icsk->icsk_ulp_ops->update(sk, proto, write_space);
}
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 3b36bb1a0dda..50a9a6e2c4cd 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -153,31 +153,34 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
if (tcp_in_slow_start(tp)) {
- /* Slow start. */
- tcp_slow_start(tp, acked);
+ /* Slow start. */
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto done;
+ }
+
+ /* Congestion avoidance. */
+ if (veno->diff < beta) {
+ /* In the "non-congestive state", increase cwnd
+ * every rtt.
+ */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
} else {
- /* Congestion avoidance. */
- if (veno->diff < beta) {
- /* In the "non-congestive state", increase cwnd
- * every rtt.
- */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
- } else {
- /* In the "congestive state", increase cwnd
- * every other rtt.
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (veno->inc &&
- tp->snd_cwnd < tp->snd_cwnd_clamp) {
- tp->snd_cwnd++;
- veno->inc = 0;
- } else
- veno->inc = 1;
- tp->snd_cwnd_cnt = 0;
+ /* In the "congestive state", increase cwnd
+ * every other rtt.
+ */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (veno->inc &&
+ tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ tp->snd_cwnd++;
+ veno->inc = 0;
} else
- tp->snd_cwnd_cnt++;
- }
+ veno->inc = 1;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt += acked;
}
+done:
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index e00570dd0a69..3bb448761ca3 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -36,8 +36,6 @@ struct yeah {
u32 reno_count;
u32 fast_count;
-
- u32 pkts_acked;
};
static void tcp_yeah_init(struct sock *sk)
@@ -57,18 +55,6 @@ static void tcp_yeah_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void tcp_yeah_pkts_acked(struct sock *sk,
- const struct ack_sample *sample)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct yeah *yeah = inet_csk_ca(sk);
-
- if (icsk->icsk_ca_state == TCP_CA_Open)
- yeah->pkts_acked = sample->pkts_acked;
-
- tcp_vegas_pkts_acked(sk, sample);
-}
-
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -77,24 +63,19 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto do_vegas;
+ }
- else if (!yeah->doing_reno_now) {
+ if (!yeah->doing_reno_now) {
/* Scalable */
-
- tp->snd_cwnd_cnt += yeah->pkts_acked;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
-
- yeah->pkts_acked = 1;
-
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ acked);
} else {
/* Reno */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
@@ -118,7 +99,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* of bytes we send in an RTT is often less than our cwnd will allow.
* So we keep track of our cwnd separately, in v_beg_snd_cwnd.
*/
-
+do_vegas:
if (after(ack, yeah->vegas.beg_snd_nxt)) {
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
@@ -232,7 +213,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
.get_info = tcp_vegas_get_info,
- .pkts_acked = tcp_yeah_pkts_acked,
+ .pkts_acked = tcp_vegas_pkts_acked,
.owner = THIS_MODULE,
.name = "yeah",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 08a41f1e1cd2..32564b350823 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1671,10 +1671,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
error = -EAGAIN;
do {
spin_lock_bh(&queue->lock);
- skb = __skb_try_recv_from_queue(sk, queue, flags,
- udp_skb_destructor,
- off, err, &last);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, off,
+ err, &last);
if (skb) {
+ if (!(flags & MSG_PEEK))
+ udp_skb_destructor(sk, skb);
spin_unlock_bh(&queue->lock);
return skb;
}
@@ -1692,9 +1693,10 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
- skb = __skb_try_recv_from_queue(sk, queue, flags,
- udp_skb_dtor_locked,
- off, err, &last);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, off,
+ err, &last);
+ if (skb && !(flags & MSG_PEEK))
+ udp_skb_dtor_locked(sk, skb);
spin_unlock(&sk_queue->lock);
spin_unlock_bh(&queue->lock);
if (skb)
@@ -2107,7 +2109,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (likely(!udp_unexpected_gso(sk, skb)))
return udp_queue_rcv_one_skb(sk, skb);
- BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
__skb_push(skb, -skb_mac_offset(skb));
segs = udp_rcv_segment(sk, skb, true);
skb_list_walk_safe(segs, skb, next) {
@@ -2286,6 +2288,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
+ bool refcounted;
/*
* Validate the packet.
@@ -2311,7 +2314,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -2320,7 +2323,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
@@ -2561,7 +2565,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_ENCAP_ESPINUDP_NON_IKE:
up->encap_rcv = xfrm4_udp_encap_rcv;
#endif
- /* FALLTHROUGH */
+ fallthrough;
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
lock_sock(sk);
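
The refcounted flag threaded through __udp4_lib_rcv() exists because a socket stashed on the skb may be held without a reference (e.g. one assigned under RCU rather than via a refcounted lookup); skb_steal_sock() now reports which case applies, so sock_put() is issued only for references actually taken. The MSG_PEEK change above is related bookkeeping: the destructor argument is gone from __skb_try_recv_from_queue(), and the caller runs udp_skb_destructor()/udp_skb_dtor_locked() only when the skb is truly consumed, never for a peek that leaves it queued. A sketch of the conditional-put pattern (deliver() is a placeholder):

	bool refcounted;
	struct sock *sk = skb_steal_sock(skb, &refcounted);

	if (sk) {
		int ret = deliver(sk, skb);

		if (refcounted)
			sock_put(sk);	/* drop only a reference we hold */
		return ret;
	}
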
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
new file mode 100644
index 000000000000..eddd973e6575
--- /dev/null
+++ b/net/ipv4/udp_bpf.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */
+
+#include <linux/skmsg.h>
+#include <net/sock.h>
+#include <net/udp.h>
+
+enum {
+ UDP_BPF_IPV4,
+ UDP_BPF_IPV6,
+ UDP_BPF_NUM_PROTS,
+};
+
+static struct proto *udpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(udpv6_prot_lock);
+static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
+
+static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+ *prot = *base;
+ prot->unhash = sock_map_unhash;
+ prot->close = sock_map_close;
+}
+
+static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+ if (sk->sk_family == AF_INET6 &&
+ unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
+ spin_lock_bh(&udpv6_prot_lock);
+ if (likely(ops != udpv6_prot_saved)) {
+ udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
+ smp_store_release(&udpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&udpv6_prot_lock);
+ }
+}
+
+static int __init udp_bpf_v4_build_proto(void)
+{
+ udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot);
+ return 0;
+}
+core_initcall(udp_bpf_v4_build_proto);
+
+struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+{
+ int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+
+ if (!psock->sk_proto)
+ udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot));
+
+ return &udp_bpf_prots[family];
+}
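
udp_bpf.c mirrors the lazy IPv6 rebuild that tcp_bpf uses: plain stores build udp_bpf_prots[UDP_BPF_IPV6] under the spinlock, then smp_store_release() publishes the saved pointer, and the unlocked smp_load_acquire() on the fast path guarantees that any reader who sees the pointer also sees the fully built table. A standalone C11 sketch of that publish/consume pairing (names illustrative):

	#include <stdatomic.h>
	#include <stdio.h>

	struct tbl { int ready; };

	static struct tbl rebuilt;
	static _Atomic(struct tbl *) saved;

	static void publish(void)
	{
		rebuilt.ready = 1;		/* plain writes first ... */
		atomic_store_explicit(&saved, &rebuilt,
				      memory_order_release);	/* ... then publish */
	}

	static const struct tbl *peek(void)
	{
		/* acquire pairs with the release above: if we see the
		 * pointer, we also see rebuilt.ready == 1 */
		return atomic_load_explicit(&saved, memory_order_acquire);
	}

	int main(void)
	{
		publish();
		const struct tbl *t = peek();
		printf("%d\n", t ? t->ready : -1);
		return 0;
	}
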
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index dccd2286bc28..1dbece34496e 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -21,16 +21,15 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
if (!inet_diag_bc_sk(bc, sk))
return 0;
- return inet_sk_diag_fill(sk, NULL, skb, req,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
+ return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
+ net_admin);
}
-static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int udp_dump_one(struct udp_table *tbl,
+ struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
+ struct sk_buff *in_skb = cb->skb;
int err = -EINVAL;
struct sock *sk = NULL;
struct sk_buff *rep;
@@ -71,11 +70,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
if (!rep)
goto out;
- err = inet_sk_diag_fill(sk, NULL, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh,
- netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
@@ -94,12 +90,16 @@ out_nosk:
static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
+ struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
+ struct nlattr *bc;
+ cb_data = cb->data;
+ bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -147,15 +147,15 @@ done:
}
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- udp_dump(&udp_table, skb, cb, r, bc);
+ udp_dump(&udp_table, skb, cb, r);
}
-static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int udp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return udp_dump_one(&udp_table, in_skb, nlh, req);
+ return udp_dump_one(&udp_table, cb, req);
}
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -250,16 +250,15 @@ static const struct inet_diag_handler udp_diag_handler = {
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- udp_dump(&udplite_table, skb, cb, r, bc);
+ udp_dump(&udplite_table, skb, cb, r);
}
-static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int udplite_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return udp_dump_one(&udplite_table, in_skb, nlh, req);
+ return udp_dump_one(&udplite_table, cb, req);
}
static const struct inet_diag_handler udplite_diag_handler = {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 1a98583a79f4..e67a66fbf27b 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -453,6 +453,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb);
int flush = 1;
+ NAPI_GRO_CB(skb)->is_flist = 0;
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
		NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;