summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/bpf_tcp_ca.c28
-rw-r--r--net/ipv4/fib_frontend.c30
-rw-r--r--net/ipv4/fib_lookup.h3
-rw-r--r--net/ipv4/fib_rules.c19
-rw-r--r--net/ipv4/fib_semantics.c58
-rw-r--r--net/ipv4/fib_trie.c58
-rw-r--r--net/ipv4/icmp.c91
-rw-r--r--net/ipv4/inet_connection_sock.c8
-rw-r--r--net/ipv4/inet_hashtables.c53
-rw-r--r--net/ipv4/inet_timewait_sock.c75
-rw-r--r--net/ipv4/ip_input.c31
-rw-r--r--net/ipv4/ip_options.c31
-rw-r--r--net/ipv4/ip_output.c6
-rw-r--r--net/ipv4/ipmr.c20
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c24
-rw-r--r--net/ipv4/nexthop.c12
-rw-r--r--net/ipv4/proc.c4
-rw-r--r--net/ipv4/route.c39
-rw-r--r--net/ipv4/sysctl_net_ipv4.c20
-rw-r--r--net/ipv4/tcp.c3
-rw-r--r--net/ipv4/tcp_bbr.c18
-rw-r--r--net/ipv4/tcp_cubic.c17
-rw-r--r--net/ipv4/tcp_dctcp.c18
-rw-r--r--net/ipv4/tcp_input.c11
-rw-r--r--net/ipv4/tcp_ipv4.c75
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/tcp_output.c6
-rw-r--r--net/ipv4/udp.c22
28 files changed, 396 insertions, 391 deletions
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index de610cb83694..f79ab942f03b 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
+#include <linux/init.h>
#include <linux/types.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
@@ -95,12 +96,14 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int off,
int size, enum bpf_access_type atype,
- u32 *next_btf_id)
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
{
size_t end;
if (atype == BPF_READ)
- return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");
@@ -212,26 +215,23 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
}
}
-BTF_SET_START(bpf_tcp_ca_kfunc_ids)
+BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
BTF_ID(func, tcp_reno_ssthresh)
BTF_ID(func, tcp_reno_cong_avoid)
BTF_ID(func, tcp_reno_undo_cwnd)
BTF_ID(func, tcp_slow_start)
BTF_ID(func, tcp_cong_avoid_ai)
-BTF_SET_END(bpf_tcp_ca_kfunc_ids)
+BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
-static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner)
-{
- if (btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id))
- return true;
- return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner);
-}
+static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &bpf_tcp_ca_check_kfunc_ids,
+};
static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
.get_func_proto = bpf_tcp_ca_get_func_proto,
.is_valid_access = bpf_tcp_ca_is_valid_access,
.btf_struct_access = bpf_tcp_ca_btf_struct_access,
- .check_kfunc_call = bpf_tcp_ca_check_kfunc_call,
};
static int bpf_tcp_ca_init_member(const struct btf_type *t,
@@ -300,3 +300,9 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
.init = bpf_tcp_ca_init,
.name = "tcp_congestion_ops",
};
+
+static int __init bpf_tcp_ca_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+}
+late_initcall(bpf_tcp_ca_kfunc_init);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 85117b45216d..e0730c4d07d6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -32,6 +32,7 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -738,8 +739,16 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
memset(cfg, 0, sizeof(*cfg));
rtm = nlmsg_data(nlh);
+
+ if (!inet_validate_dscp(rtm->rtm_tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ err = -EINVAL;
+ goto errout;
+ }
+ cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+
cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_tos = rtm->rtm_tos;
cfg->fc_table = rtm->rtm_table;
cfg->fc_protocol = rtm->rtm_protocol;
cfg->fc_scope = rtm->rtm_scope;
@@ -1550,7 +1559,7 @@ static void ip_fib_net_exit(struct net *net)
{
int i;
- rtnl_lock();
+ ASSERT_RTNL();
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@@ -1575,7 +1584,7 @@ static void ip_fib_net_exit(struct net *net)
#ifdef CONFIG_IP_MULTIPLE_TABLES
fib4_rules_exit(net);
#endif
- rtnl_unlock();
+
kfree(net->ipv4.fib_table_hash);
fib4_notifier_exit(net);
}
@@ -1602,7 +1611,9 @@ out:
out_proc:
nl_fib_lookup_exit(net);
out_nlfl:
+ rtnl_lock();
ip_fib_net_exit(net);
+ rtnl_unlock();
goto out;
}
@@ -1610,12 +1621,23 @@ static void __net_exit fib_net_exit(struct net *net)
{
fib_proc_exit(net);
nl_fib_lookup_exit(net);
- ip_fib_net_exit(net);
+}
+
+static void __net_exit fib_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ip_fib_net_exit(net);
+
+ rtnl_unlock();
}
static struct pernet_operations fib_net_ops = {
.init = fib_net_init,
.exit = fib_net_exit,
+ .exit_batch = fib_net_exit_batch,
};
void __init ip_fib_init(void)
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 78e40ea42e58..f9b9e26c32c1 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -4,13 +4,14 @@
#include <linux/types.h>
#include <linux/list.h>
+#include <net/inet_dscp.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
struct fib_alias {
struct hlist_node fa_list;
struct fib_info *fa_info;
- u8 fa_tos;
+ dscp_t fa_dscp;
u8 fa_type;
u8 fa_state;
u8 fa_slen;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index e0b6c8b6de57..001fea394bde 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
@@ -35,7 +36,7 @@ struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
- u8 tos;
+ dscp_t dscp;
__be32 src;
__be32 srcmask;
__be32 dst;
@@ -49,7 +50,7 @@ static bool fib4_rule_matchall(const struct fib_rule *rule)
{
struct fib4_rule *r = container_of(rule, struct fib4_rule, common);
- if (r->dst_len || r->src_len || r->tos)
+ if (r->dst_len || r->src_len || r->dscp)
return false;
return fib_rule_matchall(rule);
}
@@ -185,7 +186,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl4->flowi4_tos))
+ if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
@@ -225,10 +226,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ /* IPv4 currently doesn't handle high order DSCP bits correctly */
if (frh->tos & ~IPTOS_TOS_MASK) {
NL_SET_ERR_MSG(extack, "Invalid tos");
goto errout;
}
+ rule4->dscp = inet_dsfield_to_dscp(frh->tos);
/* split local/main if they are not already split */
err = fib_unmerge(net);
@@ -270,7 +278,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->srcmask = inet_make_mask(rule4->src_len);
rule4->dst_len = frh->dst_len;
rule4->dstmask = inet_make_mask(rule4->dst_len);
- rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
@@ -313,7 +320,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule4->dst_len != frh->dst_len))
return 0;
- if (frh->tos && (rule4->tos != frh->tos))
+ if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos)
return 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -337,7 +344,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
- frh->tos = rule4->tos;
+ frh->tos = inet_dscp_to_dsfield(rule4->dscp);
if ((rule4->dst_len &&
nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2dd375f7407b..c5a29703185a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -32,6 +32,7 @@
#include <linux/hash.h>
#include <net/arp.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -523,7 +524,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
fri.tb_id = tb_id;
fri.dst = key;
fri.dst_len = dst_len;
- fri.tos = fa->fa_tos;
+ fri.tos = inet_dscp_to_dsfield(fa->fa_dscp);
fri.type = fa->fa_type;
fri.offload = READ_ONCE(fa->offload);
fri.trap = READ_ONCE(fa->trap);
@@ -1257,34 +1258,13 @@ fib_info_laddrhash_bucket(const struct net *net, __be32 val)
return &fib_info_laddrhash[slot];
}
-static struct hlist_head *fib_info_hash_alloc(int bytes)
-{
- if (bytes <= PAGE_SIZE)
- return kzalloc(bytes, GFP_KERNEL);
- else
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(bytes));
-}
-
-static void fib_info_hash_free(struct hlist_head *hash, int bytes)
-{
- if (!hash)
- return;
-
- if (bytes <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long) hash, get_order(bytes));
-}
-
static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *new_laddrhash,
unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
unsigned int old_size = fib_info_hash_size;
- unsigned int i, bytes;
+ unsigned int i;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
@@ -1325,9 +1305,8 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
spin_unlock_bh(&fib_info_lock);
- bytes = old_size * sizeof(struct hlist_head *);
- fib_info_hash_free(old_info_hash, bytes);
- fib_info_hash_free(old_laddrhash, bytes);
+ kvfree(old_info_hash);
+ kvfree(old_laddrhash);
}
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
@@ -1444,19 +1423,19 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
- unsigned int bytes;
+ size_t bytes;
if (!new_size)
new_size = 16;
- bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_info_hash_alloc(bytes);
- new_laddrhash = fib_info_hash_alloc(bytes);
+ bytes = (size_t)new_size * sizeof(struct hlist_head *);
+ new_info_hash = kvzalloc(bytes, GFP_KERNEL);
+ new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
if (!new_info_hash || !new_laddrhash) {
- fib_info_hash_free(new_info_hash, bytes);
- fib_info_hash_free(new_laddrhash, bytes);
- } else
+ kvfree(new_info_hash);
+ kvfree(new_laddrhash);
+ } else {
fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
-
+ }
if (!fib_info_hash_size)
goto failure;
}
@@ -2061,7 +2040,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
int order = -1, last_idx = -1;
struct fib_alias *fa, *fa1 = NULL;
u32 last_prio = res->fi->fib_priority;
- u8 last_tos = 0;
+ dscp_t last_dscp = 0;
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
@@ -2069,19 +2048,20 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
if (fa->fa_slen != slen)
continue;
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp &&
+ fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos))
continue;
if (fa->tb_id != tb->tb_id)
continue;
if (next_fi->fib_priority > last_prio &&
- fa->fa_tos == last_tos) {
- if (last_tos)
+ fa->fa_dscp == last_dscp) {
+ if (last_dscp)
continue;
break;
}
if (next_fi->fib_flags & RTNH_F_DEAD)
continue;
- last_tos = fa->fa_tos;
+ last_dscp = fa->fa_dscp;
last_prio = next_fi->fib_priority;
if (next_fi->fib_scope != res->scope ||
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f7f74d5c14da..2af2b99e0bea 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -61,6 +61,7 @@
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -81,7 +82,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .tos = inet_dscp_to_dsfield(fa->fa_dscp),
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
@@ -98,7 +99,7 @@ static int call_fib_entry_notifiers(struct net *net,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .tos = inet_dscp_to_dsfield(fa->fa_dscp),
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
@@ -973,13 +974,13 @@ static struct key_vector *fib_find_node(struct trie *t,
return n;
}
-/* Return the first fib alias matching TOS with
+/* Return the first fib alias matching DSCP with
* priority less than or equal to PRIO.
* If 'find_first' is set, return the first matching
- * fib alias, regardless of TOS and priority.
+ * fib alias, regardless of DSCP and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
- u8 tos, u32 prio, u32 tb_id,
+ dscp_t dscp, u32 prio, u32 tb_id,
bool find_first)
{
struct fib_alias *fa;
@@ -988,6 +989,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
return NULL;
hlist_for_each_entry(fa, fah, fa_list) {
+ /* Avoid Sparse warning when using dscp_t in inequalities */
+ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
+ u8 __dscp = inet_dscp_to_dsfield(dscp);
+
if (fa->fa_slen < slen)
continue;
if (fa->fa_slen != slen)
@@ -998,9 +1003,9 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
break;
if (find_first)
return fa;
- if (fa->fa_tos > tos)
+ if (__fa_dscp > __dscp)
continue;
- if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+ if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
return fa;
}
@@ -1027,8 +1032,8 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
- fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
- fa->fa_type == fri->type)
+ fa->fa_dscp == inet_dsfield_to_dscp(fri->tos) &&
+ fa->fa_info == fri->fi && fa->fa_type == fri->type)
return fa;
}
@@ -1214,7 +1219,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
int err;
@@ -1231,12 +1236,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto err;
}
+ dscp = cfg->fc_dscp;
l = fib_find_node(t, &tp, key);
- fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
+ * with the same keys [prefix,dscp,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
@@ -1244,7 +1250,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
* of the new alias.
*/
- if (fa && fa->fa_tos == tos &&
+ if (fa && fa->fa_dscp == dscp &&
fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_first, *fa_match;
@@ -1264,7 +1270,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
hlist_for_each_entry_from(fa, fa_list) {
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
@@ -1292,7 +1298,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
fi_drop = fa->fa_info;
- new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
@@ -1355,7 +1361,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
+ new_fa->fa_dscp = dscp;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
new_fa->fa_slen = slen;
@@ -1571,7 +1577,8 @@ found:
if (index >= (1ul << fa->fa_slen))
continue;
}
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp &&
+ inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
continue;
if (fi->fib_dead)
continue;
@@ -1707,7 +1714,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
struct key_vector *l, *tp;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
key = ntohl(cfg->fc_dst);
@@ -1719,11 +1726,13 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!l)
return -ESRCH;
- fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
+ dscp = cfg->fc_dscp;
+ fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
- pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+ pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
+ inet_dscp_to_dsfield(dscp), t);
fa_to_delete = NULL;
hlist_for_each_entry_from(fa, fa_list) {
@@ -1731,7 +1740,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -2299,7 +2308,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
fri.tb_id = tb->tb_id;
fri.dst = xkey;
fri.dst_len = KEYLENGTH - fa->fa_slen;
- fri.tos = fa->fa_tos;
+ fri.tos = inet_dscp_to_dsfield(fa->fa_dscp);
fri.type = fa->fa_type;
fri.offload = READ_ONCE(fa->offload);
fri.trap = READ_ONCE(fa->trap);
@@ -2811,8 +2820,9 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
fa->fa_info->fib_scope),
rtn_type(buf2, sizeof(buf2),
fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, " tos=%d", fa->fa_tos);
+ if (fa->fa_dscp)
+ seq_printf(seq, " tos=%d",
+ inet_dscp_to_dsfield(fa->fa_dscp));
seq_putc(seq, '\n');
}
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index b7e277d8a84d..72a375c7f417 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -192,24 +192,14 @@ struct icmp_control {
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmp_sk(struct net *net)
-{
- return this_cpu_read(*net->ipv4.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
struct sock *sk;
- sk = icmp_sk(net);
+ sk = this_cpu_read(ipv4_icmp_sk);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
@@ -217,11 +207,13 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
*/
return NULL;
}
+ sock_net_set(sk, net);
return sk;
}
static inline void icmp_xmit_unlock(struct sock *sk)
{
+ sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
@@ -363,14 +355,13 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
return 0;
}
-static void icmp_push_reply(struct icmp_bxm *icmp_param,
+static void icmp_push_reply(struct sock *sk,
+ struct icmp_bxm *icmp_param,
struct flowi4 *fl4,
struct ipcm_cookie *ipc, struct rtable **rt)
{
- struct sock *sk;
struct sk_buff *skb;
- sk = icmp_sk(dev_net((*rt)->dst.dev));
if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
@@ -452,7 +443,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
if (IS_ERR(rt))
goto out_unlock;
if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
- icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
@@ -766,7 +757,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (!fl4.saddr)
fl4.saddr = htonl(INADDR_DUMMY);
- icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
@@ -1434,46 +1425,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
},
};
-static void __net_exit icmp_sk_exit(struct net *net)
-{
- int i;
-
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
- free_percpu(net->ipv4.icmp_sk);
- net->ipv4.icmp_sk = NULL;
-}
-
static int __net_init icmp_sk_init(struct net *net)
{
- int i, err;
-
- net->ipv4.icmp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.icmp_sk)
- return -ENOMEM;
-
- for_each_possible_cpu(i) {
- struct sock *sk;
-
- err = inet_ctl_sock_create(&sk, PF_INET,
- SOCK_RAW, IPPROTO_ICMP, net);
- if (err < 0)
- goto fail;
-
- *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
-
- /* Enough space for 2 64K ICMP packets, including
- * sk_buff/skb_shared_info struct overhead.
- */
- sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
-
- /*
- * Speedup sock_wfree()
- */
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
- }
-
/* Control parameters for ECHO replies. */
net->ipv4.sysctl_icmp_echo_ignore_all = 0;
net->ipv4.sysctl_icmp_echo_enable_probe = 0;
@@ -1499,18 +1452,36 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
return 0;
-
-fail:
- icmp_sk_exit(net);
- return err;
}
static struct pernet_operations __net_initdata icmp_sk_ops = {
.init = icmp_sk_init,
- .exit = icmp_sk_exit,
};
int __init icmp_init(void)
{
+ int err, i;
+
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, &init_net);
+ if (err < 0)
+ return err;
+
+ per_cpu(ipv4_icmp_sk, i) = sk;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff/skb_shared_info struct overhead.
+ */
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+ /*
+ * Speedup sock_wfree()
+ */
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+ }
return register_pernet_subsys(&icmp_sk_ops);
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fc2a985f6064..1e5b53c2bb26 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -866,12 +866,9 @@ static void reqsk_timer_handler(struct timer_list *t)
(!resend ||
!inet_rtx_syn_ack(sk_listener, req) ||
inet_rsk(req)->acked)) {
- unsigned long timeo;
-
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
- timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
- mod_timer(&req->rsk_timer, jiffies + timeo);
+ mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
if (!nreq)
return;
@@ -1046,6 +1043,9 @@ int inet_csk_listen_start(struct sock *sk)
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
+ if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
+ sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 30ab717ff1b8..17440840a791 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -637,7 +637,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
int err = 0;
if (sk->sk_state != TCP_LISTEN) {
+ local_bh_disable();
inet_ehash_nolisten(sk, osk, NULL);
+ local_bh_enable();
return 0;
}
WARN_ON(!sk_unhashed(sk));
@@ -669,45 +671,54 @@ int inet_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
EXPORT_SYMBOL_GPL(inet_hash);
-void inet_unhash(struct sock *sk)
+static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_listen_hashbucket *ilb = NULL;
- spinlock_t *lock;
-
if (sk_unhashed(sk))
return;
- if (sk->sk_state == TCP_LISTEN) {
- ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &ilb->lock;
- } else {
- lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
- }
- spin_lock_bh(lock);
- if (sk_unhashed(sk))
- goto unlock;
-
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_stop_listen_sock(sk);
if (ilb) {
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
inet_unhash2(hashinfo, sk);
ilb->count--;
}
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-unlock:
- spin_unlock_bh(lock);
+}
+
+void inet_unhash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+ if (sk_unhashed(sk))
+ return;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ struct inet_listen_hashbucket *ilb;
+
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ /* Don't disable bottom halves while acquiring the lock to
+ * avoid circular locking dependency on PREEMPT_RT.
+ */
+ spin_lock(&ilb->lock);
+ __inet_unhash(sk, ilb);
+ spin_unlock(&ilb->lock);
+ } else {
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock_bh(lock);
+ __inet_unhash(sk, NULL);
+ spin_unlock_bh(lock);
+ }
}
EXPORT_SYMBOL_GPL(inet_unhash);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 437afe392e66..9e0bbd026560 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -52,14 +52,15 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
spin_unlock(lock);
/* Disassociate with bind bucket. */
- bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
- hashinfo->bhash_size)];
+ bhead = &hashinfo->bhash[tw->tw_bslot];
spin_lock(&bhead->lock);
inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- atomic_dec(&tw->tw_dr->tw_count);
+ if (refcount_dec_and_test(&tw->tw_dr->tw_refcount))
+ kfree(tw->tw_dr);
+
inet_twsk_put(tw);
}
@@ -110,8 +111,12 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
Note, that any socket with inet->num != 0 MUST be bound in
binding cache, even if it is closed.
*/
- bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
- hashinfo->bhash_size)];
+ /* Cache inet_bhashfn(), because 'struct net' might be no longer
+ * available later in inet_twsk_kill().
+ */
+ tw->tw_bslot = inet_bhashfn(twsk_net(tw), inet->inet_num,
+ hashinfo->bhash_size);
+ bhead = &hashinfo->bhash[tw->tw_bslot];
spin_lock(&bhead->lock);
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
@@ -145,10 +150,6 @@ static void tw_timer_handler(struct timer_list *t)
{
struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
- if (tw->tw_kill)
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
- else
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
@@ -158,7 +159,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
{
struct inet_timewait_sock *tw;
- if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+ if (refcount_read(&dr->tw_refcount) - 1 >= dr->sysctl_max_tw_buckets)
return NULL;
tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
@@ -244,59 +245,15 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
* of PAWS.
*/
- tw->tw_kill = timeo <= 4*HZ;
if (!rearm) {
+ bool kill = timeo <= 4*HZ;
+
+ __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
+ LINUX_MIB_TIMEWAITED);
BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
- atomic_inc(&tw->tw_dr->tw_count);
+ refcount_inc(&tw->tw_dr->tw_refcount);
} else {
mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
-
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
-{
- struct inet_timewait_sock *tw;
- struct sock *sk;
- struct hlist_nulls_node *node;
- unsigned int slot;
-
- for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
-restart_rcu:
- cond_resched();
- rcu_read_lock();
-restart:
- sk_nulls_for_each_rcu(sk, node, &head->chain) {
- if (sk->sk_state != TCP_TIME_WAIT)
- continue;
- tw = inet_twsk(sk);
- if ((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->ns.count))
- continue;
-
- if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
- continue;
-
- if (unlikely((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->ns.count))) {
- inet_twsk_put(tw);
- goto restart;
- }
-
- rcu_read_unlock();
- local_bh_disable();
- inet_twsk_deschedule_put(tw);
- local_bh_enable();
- goto restart_rcu;
- }
- /* If the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto restart;
- rcu_read_unlock();
- }
-}
-EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3a025c011971..d94f9f7e60c3 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -196,7 +196,8 @@ resubmit:
if (ipprot) {
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_XFRM_POLICY);
return;
}
nf_reset_ct(skb);
@@ -215,7 +216,7 @@ resubmit:
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
} else {
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
@@ -318,8 +319,10 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
+ int err, drop_reason;
struct rtable *rt;
- int err;
+
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (ip_can_use_hint(skb, iph, hint)) {
err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
@@ -396,19 +399,23 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
* so-called "hole-196" attack) so do it for both.
*/
if (in_dev &&
- IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+ IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
+ drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
goto drop;
+ }
}
return NET_RX_SUCCESS;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return NET_RX_DROP;
drop_error:
- if (err == -EXDEV)
+ if (err == -EXDEV) {
+ drop_reason = SKB_DROP_REASON_IP_RPFILTER;
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+ }
goto drop;
}
@@ -436,13 +443,16 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
+ int drop_reason;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
- if (skb->pkt_type == PACKET_OTHERHOST)
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ drop_reason = SKB_DROP_REASON_OTHERHOST;
goto drop;
+ }
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
@@ -452,6 +462,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
goto out;
}
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
@@ -488,6 +499,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
len = ntohs(iph->tot_len);
if (skb->len < len) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
@@ -516,11 +528,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
return skb;
csum_error:
+ drop_reason = SKB_DROP_REASON_IP_CSUM;
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
+ if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ drop_reason = SKB_DROP_REASON_IP_INHDR;
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out:
return NULL;
}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index da1b5038bdfd..a9e22a098872 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -42,7 +42,7 @@
*/
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
- __be32 daddr, struct rtable *rt, int is_frag)
+ __be32 daddr, struct rtable *rt)
{
unsigned char *iph = skb_network_header(skb);
@@ -53,28 +53,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
if (opt->srr)
memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
- if (!is_frag) {
- if (opt->rr_needaddr)
- ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
- if (opt->ts_needaddr)
- ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
- if (opt->ts_needtime) {
- __be32 midtime;
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
+ if (opt->ts_needtime) {
+ __be32 midtime;
- midtime = inet_current_timestamp();
- memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
- }
- return;
- }
- if (opt->rr) {
- memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]);
- opt->rr = 0;
- opt->rr_needaddr = 0;
- }
- if (opt->ts) {
- memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]);
- opt->ts = 0;
- opt->ts_needaddr = opt->ts_needtime = 0;
+ midtime = inet_current_timestamp();
+ memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 139cec29ed06..0c0574eb5f5b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -179,7 +179,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
- ip_options_build(skb, &opt->opt, daddr, rt, 0);
+ ip_options_build(skb, &opt->opt, daddr, rt);
}
skb->priority = sk->sk_priority;
@@ -519,7 +519,7 @@ packet_routed:
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
- ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
}
ip_select_ident_segs(net, skb, sk,
@@ -1541,7 +1541,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (opt) {
iph->ihl += opt->optlen >> 2;
- ip_options_build(skb, opt, cork->addr, rt, 0);
+ ip_options_build(skb, opt, cork->addr, rt);
}
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 29bbe2b08ae9..c860519d57ee 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -268,13 +268,12 @@ static void __net_exit ipmr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
- rtnl_lock();
+ ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
list_del(&mrt->list);
ipmr_free_table(mrt);
}
fib_rules_unregister(net->ipv4.mr_rules_ops);
- rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -330,10 +329,9 @@ static int __net_init ipmr_rules_init(struct net *net)
static void __net_exit ipmr_rules_exit(struct net *net)
{
- rtnl_lock();
+ ASSERT_RTNL();
ipmr_free_table(net->ipv4.mrt);
net->ipv4.mrt = NULL;
- rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -3077,7 +3075,9 @@ static int __net_init ipmr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
+ rtnl_lock();
ipmr_rules_exit(net);
+ rtnl_unlock();
#endif
ipmr_rules_fail:
ipmr_notifier_exit(net);
@@ -3092,12 +3092,22 @@ static void __net_exit ipmr_net_exit(struct net *net)
remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_notifier_exit(net);
- ipmr_rules_exit(net);
+}
+
+static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ipmr_rules_exit(net);
+ rtnl_unlock();
}
static struct pernet_operations ipmr_net_ops = {
.init = ipmr_net_init,
.exit = ipmr_net_exit,
+ .exit_batch = ipmr_net_exit_batch,
};
int __init ip_mr_init(void)
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 3f248a19faa3..fab357cc8559 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -295,28 +295,24 @@ pptp_inbound_pkt(struct sk_buff *skb,
return NF_ACCEPT;
}
+static const struct nf_nat_pptp_hook pptp_hooks = {
+ .outbound = pptp_outbound_pkt,
+ .inbound = pptp_inbound_pkt,
+ .exp_gre = pptp_exp_gre,
+ .expectfn = pptp_nat_expected,
+};
+
static int __init nf_nat_helper_pptp_init(void)
{
- BUG_ON(nf_nat_pptp_hook_outbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_inbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+ WARN_ON(nf_nat_pptp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks);
- BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
return 0;
}
static void __exit nf_nat_helper_pptp_fini(void)
{
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, NULL);
synchronize_rcu();
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index eeafeccebb8d..e459a391e607 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -3733,12 +3733,16 @@ out:
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
-static void __net_exit nexthop_net_exit(struct net *net)
+static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- flush_all_nexthops(net);
+ list_for_each_entry(net, net_list, exit_list) {
+ flush_all_nexthops(net);
+ kfree(net->nexthop.devhash);
+ }
rtnl_unlock();
- kfree(net->nexthop.devhash);
}
static int __net_init nexthop_net_init(struct net *net)
@@ -3756,7 +3760,7 @@ static int __net_init nexthop_net_init(struct net *net)
static struct pernet_operations nexthop_net_ops = {
.init = nexthop_net_init,
- .exit = nexthop_net_exit,
+ .exit_batch = nexthop_net_exit_batch,
};
static int __init nexthop_init(void)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f30273afb539..28836071f0a6 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,8 +59,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
- proto_memory_allocated(&tcp_prot));
+ refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1,
+ sockets, proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
proto_memory_allocated(&udp_prot));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f33ad1f383b6..f444f5983405 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -84,6 +84,7 @@
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
@@ -112,14 +113,13 @@
#define DEFAULT_MIN_PMTU (512 + 20 + 20)
#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
-
+#define DEFAULT_MIN_ADVMSS 256
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
-static int ip_rt_min_advmss __read_mostly = 256;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
@@ -458,7 +458,7 @@ static u32 *ip_tstamps __read_mostly;
* if one generator is seldom used. This makes hard for an attacker
* to infer how many packets were sent between two points in time.
*/
-u32 ip_idents_reserve(u32 hash, int segs)
+static u32 ip_idents_reserve(u32 hash, int segs)
{
u32 bucket, old, now = (u32)jiffies;
atomic_t *p_id;
@@ -479,7 +479,6 @@ u32 ip_idents_reserve(u32 hash, int segs)
*/
return atomic_add_return(segs + delta, p_id) - segs;
}
-EXPORT_SYMBOL(ip_idents_reserve);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
@@ -1298,9 +1297,10 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
+ struct net *net = dev_net(dst->dev);
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
- ip_rt_min_advmss);
+ net->ipv4.ip_rt_min_advmss);
return min(advmss, IPV4_MAX_PMTU - header_size);
}
@@ -1485,6 +1485,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
struct uncached_list {
spinlock_t lock;
struct list_head head;
+ struct list_head quarantine;
};
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
@@ -1506,7 +1507,7 @@ void rt_del_uncached_list(struct rtable *rt)
struct uncached_list *ul = rt->rt_uncached_list;
spin_lock_bh(&ul->lock);
- list_del(&rt->rt_uncached);
+ list_del_init(&rt->rt_uncached);
spin_unlock_bh(&ul->lock);
}
}
@@ -1521,20 +1522,24 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
void rt_flush_dev(struct net_device *dev)
{
- struct rtable *rt;
+ struct rtable *rt, *safe;
int cpu;
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+ if (list_empty(&ul->head))
+ continue;
+
spin_lock_bh(&ul->lock);
- list_for_each_entry(rt, &ul->head, rt_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
rt->dst.dev = blackhole_netdev;
dev_replace_track(dev, blackhole_netdev,
&rt->dst.dev_tracker,
GFP_ATOMIC);
+ list_move(&rt->rt_uncached, &ul->quarantine);
}
spin_unlock_bh(&ul->lock);
}
@@ -3392,7 +3397,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (fa->fa_slen == slen &&
fa->tb_id == fri.tb_id &&
- fa->fa_tos == fri.tos &&
+ fa->fa_dscp == inet_dsfield_to_dscp(fri.tos) &&
fa->fa_info == res.fi &&
fa->fa_type == fri.type) {
fri.offload = READ_ONCE(fa->offload);
@@ -3535,13 +3540,6 @@ static struct ctl_table ipv4_route_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "min_adv_mss",
- .data = &ip_rt_min_advmss,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{ }
};
@@ -3569,6 +3567,13 @@ static struct ctl_table ipv4_route_netns_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "min_adv_mss",
+ .data = &init_net.ipv4.ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ },
};
@@ -3631,6 +3636,7 @@ static __net_init int netns_ip_rt_init(struct net *net)
/* Set default value for namespaceified sysctls */
net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+ net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
return 0;
}
@@ -3705,6 +3711,7 @@ int __init ip_rt_init(void)
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
INIT_LIST_HEAD(&ul->head);
+ INIT_LIST_HEAD(&ul->quarantine);
spin_lock_init(&ul->lock);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 97eb54774924..1cae27b5dcd8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -589,6 +589,14 @@ static struct ctl_table ipv4_table[] = {
};
static struct ctl_table ipv4_net_table[] = {
+ /* tcp_max_tw_buckets must be first in this table. */
+ {
+ .procname = "tcp_max_tw_buckets",
+/* .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{
.procname = "icmp_echo_ignore_all",
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
@@ -1001,13 +1009,6 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &two,
},
{
- .procname = "tcp_max_tw_buckets",
- .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_syn_backlog",
.data = &init_net.ipv4.sysctl_max_syn_backlog,
.maxlen = sizeof(int),
@@ -1400,7 +1401,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!table)
goto err_alloc;
- for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+ /* skip first entry (sysctl_max_tw_buckets) */
+ for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
if (table[i].data) {
/* Update the variables to point into
* the current struct net
@@ -1415,6 +1417,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
}
}
+ table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
+
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (!net->ipv4.ipv4_hdr)
goto err_reg;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 02cb275e5487..760e8221d321 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -894,8 +894,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return mss_now;
/* Note : tcp_tso_autosize() will eventually split this later */
- new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
- new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
/* We try hard to avoid divides here */
size_goal = tp->gso_segs * mss_now;
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index ec5550089b4d..02e8626ccb27 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1154,7 +1154,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.set_state = bbr_set_state,
};
-BTF_SET_START(tcp_bbr_kfunc_ids)
+BTF_SET_START(tcp_bbr_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, bbr_init)
@@ -1167,25 +1167,27 @@ BTF_ID(func, bbr_min_tso_segs)
BTF_ID(func, bbr_set_state)
#endif
#endif
-BTF_SET_END(tcp_bbr_kfunc_ids)
+BTF_SET_END(tcp_bbr_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_bbr_check_kfunc_ids,
+};
static int __init bbr_register(void)
{
int ret;
BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
- ret = tcp_register_congestion_control(&tcp_bbr_cong_ops);
- if (ret)
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
}
static void __exit bbr_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index e07837e23b3f..24d562dd6225 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.name = "cubic",
};
-BTF_SET_START(tcp_cubic_kfunc_ids)
+BTF_SET_START(tcp_cubic_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, cubictcp_init)
@@ -496,9 +496,12 @@ BTF_ID(func, cubictcp_cwnd_event)
BTF_ID(func, cubictcp_acked)
#endif
#endif
-BTF_SET_END(tcp_cubic_kfunc_ids)
+BTF_SET_END(tcp_cubic_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_cubic_check_kfunc_ids,
+};
static int __init cubictcp_register(void)
{
@@ -534,16 +537,14 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);
- ret = tcp_register_congestion_control(&cubictcp);
- if (ret)
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&cubictcp);
}
static void __exit cubictcp_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
tcp_unregister_congestion_control(&cubictcp);
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 0d7ab3cc7b61..1943a6630341 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -238,7 +238,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.name = "dctcp-reno",
};
-BTF_SET_START(tcp_dctcp_kfunc_ids)
+BTF_SET_START(tcp_dctcp_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, dctcp_init)
@@ -249,25 +249,27 @@ BTF_ID(func, dctcp_cwnd_undo)
BTF_ID(func, dctcp_state)
#endif
#endif
-BTF_SET_END(tcp_dctcp_kfunc_ids)
+BTF_SET_END(tcp_dctcp_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_dctcp_check_kfunc_ids,
+};
static int __init dctcp_register(void)
{
int ret;
BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
- ret = tcp_register_congestion_control(&dctcp);
- if (ret)
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&dctcp);
}
static void __exit dctcp_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
tcp_unregister_congestion_control(&dctcp);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bfe4112e000c..92e65d56dc2c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6703,7 +6703,8 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
- ireq->smc_ok = rx_opt->smc_ok;
+ ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
+ tcp_sk(sk)->smc_hs_congested(sk));
#endif
}
@@ -6725,6 +6726,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));
ireq->ireq_family = sk_listener->sk_family;
+ req->timeout = TCP_TIMEOUT_INIT;
}
return req;
@@ -6941,9 +6943,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
- if (!want_cookie)
- inet_csk_reqsk_queue_hash_add(sk, req,
- tcp_timeout_init((struct sock *)req));
+ if (!want_cookie) {
+ req->timeout = tcp_timeout_init((struct sock *)req);
+ inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
+ }
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fec656f5a39e..6873f46fc8ba 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -91,6 +91,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
+static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
return secure_tcp_seq(ip_hdr(skb)->daddr,
@@ -206,7 +208,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+ struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -810,7 +812,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ sock_net_set(ctl_sk, net);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
@@ -825,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
transmit_time);
ctl_sk->sk_mark = 0;
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable();
@@ -908,7 +912,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ sock_net_set(ctl_sk, net);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -921,6 +926,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
transmit_time);
ctl_sk->sk_mark = 0;
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable();
}
@@ -3111,41 +3117,18 @@ EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
- int cpu;
+ struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
if (net->ipv4.tcp_congestion_control)
bpf_module_put(net->ipv4.tcp_congestion_control,
net->ipv4.tcp_congestion_control->owner);
-
- for_each_possible_cpu(cpu)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
- free_percpu(net->ipv4.tcp_sk);
+ if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
+ kfree(tcp_death_row);
}
static int __net_init tcp_sk_init(struct net *net)
{
- int res, cpu, cnt;
-
- net->ipv4.tcp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.tcp_sk)
- return -ENOMEM;
-
- for_each_possible_cpu(cpu) {
- struct sock *sk;
-
- res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
- IPPROTO_TCP, net);
- if (res)
- goto fail;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-
- /* Please enforce IP_DF and IPID==0 for RST and
- * ACK sent in SYN-RECV and TIME-WAIT state.
- */
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
-
- *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
- }
+ int cnt;
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3172,9 +3155,13 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tw_reuse = 2;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
+ net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
+ if (!net->ipv4.tcp_death_row)
+ return -ENOMEM;
+ refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
- net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+ net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
+ net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
@@ -3229,18 +3216,12 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.tcp_congestion_control = &tcp_reno;
return 0;
-fail:
- tcp_sk_exit(net);
-
- return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
- inet_twsk_purge(&tcp_hashinfo, AF_INET);
-
list_for_each_entry(net, net_exit_list, exit_list)
tcp_fastopen_ctx_destroy(net);
}
@@ -3326,6 +3307,24 @@ static void __init bpf_iter_register(void)
void __init tcp_v4_init(void)
{
+ int cpu, res;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, &init_net);
+ if (res)
+ panic("Failed to create the TCP control socket.\n");
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ /* Please enforce IP_DF and IPID==0 for RST and
+ * ACK sent in SYN-RECV and TIME-WAIT state.
+ */
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+ per_cpu(ipv4_tcp_sk, cpu) = sk;
+ }
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7c2d3ac2363a..6366df7aaf2a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -248,7 +248,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+ struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
tw = inet_twsk_alloc(sk, tcp_death_row, state);
@@ -583,7 +583,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -622,8 +622,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
!inet_rtx_syn_ack(sk, req)) {
unsigned long expires = jiffies;
- expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
- TCP_RTO_MAX);
+ expires += reqsk_timeout(req, TCP_RTO_MAX);
if (!fastopen)
mod_timer_pending(&req->rsk_timer, expires);
else
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5079832af5c1..e76bf1e9251e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1960,7 +1960,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
bytes = min_t(unsigned long,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+ sk->sk_gso_max_size);
/* Goal is to send at least one packet per ms,
* not one big TSO packet every 100 ms.
@@ -4092,7 +4092,9 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
struct flowi fl;
int res;
- tcp_rsk(req)->txhash = net_tx_rndhash();
+ /* Paired with WRITE_ONCE() in sock_setsockopt() */
+ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+ tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
NULL);
if (!res) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 090360939401..6b4d8361560f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2093,16 +2093,20 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
+ int drop_reason;
/* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
+ if (rc == -ENOMEM) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
- else
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
}
@@ -2120,14 +2124,17 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
*/
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
nf_reset_ct(skb);
if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
@@ -2204,8 +2211,10 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto drop;
+ }
udp_csum_pull_header(skb);
@@ -2213,11 +2222,12 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
return __udp_queue_rcv_skb(sk, skb);
csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return -1;
}