From a90c9347e90ed1e9323d71402ed18023bc910cd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:06 -0700 Subject: ipv6: addrlabel: per netns list Having a global list of labels do not scale to thousands of netns in the cloud era. This causes quadratic behavior on netns creation and deletion. This is time having a per netns list of ~10 labels. Tested: $ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ] real 0m20.837s # instead of 0m24.227s user 0m0.328s sys 0m20.338s # instead of 0m23.753s 16.17% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 12.30% ip [kernel.kallsyms] [k] netlink_has_listeners 6.76% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 5.78% ip [kernel.kallsyms] [k] memset_erms 5.77% ip [kernel.kallsyms] [k] kobject_uevent_env 5.18% ip [kernel.kallsyms] [k] refcount_sub_and_test 4.96% ip [kernel.kallsyms] [k] _raw_read_lock 3.82% ip [kernel.kallsyms] [k] refcount_inc_not_zero 3.33% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 2.11% ip [kernel.kallsyms] [k] unmap_page_range 1.77% ip [kernel.kallsyms] [k] __wake_up 1.69% ip [kernel.kallsyms] [k] strlen 1.17% ip [kernel.kallsyms] [k] __wake_up_common 1.09% ip [kernel.kallsyms] [k] insert_header 1.04% ip [kernel.kallsyms] [k] page_remove_rmap 1.01% ip [kernel.kallsyms] [k] consume_skb 0.98% ip [kernel.kallsyms] [k] netlink_trim 0.51% ip [kernel.kallsyms] [k] kernfs_link_sibling 0.51% ip [kernel.kallsyms] [k] filemap_map_pages 0.46% ip [kernel.kallsyms] [k] memcpy_erms Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 81 +++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 51 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index b055bc79f56d..c6311d7108f6 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -30,7 +30,6 @@ * Policy Table */ struct ip6addrlbl_entry { - possible_net_t lbl_net; struct in6_addr prefix; int prefixlen; int ifindex; @@ -41,19 +40,6 @@ struct ip6addrlbl_entry { struct rcu_head rcu; }; -static struct ip6addrlbl_table -{ - struct hlist_head head; - spinlock_t lock; - u32 seq; -} ip6addrlbl_table; - -static inline -struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) -{ - return read_pnet(&lbl->lbl_net); -} - /* * Default policy table (RFC6724 + extensions) * @@ -148,13 +134,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) } /* Find label */ -static bool __ip6addrlbl_match(struct net *net, - const struct ip6addrlbl_entry *p, +static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, int addrtype, int ifindex) { - if (!net_eq(ip6addrlbl_net(p), net)) - return false; if (p->ifindex && p->ifindex != ifindex) return false; if (p->addrtype && p->addrtype != addrtype) @@ -169,8 +152,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, int type, int ifindex) { struct ip6addrlbl_entry *p; - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (__ip6addrlbl_match(net, p, addr, type, ifindex)) + + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(p, addr, type, ifindex)) return p; } return NULL; @@ -196,8 +180,7 @@ u32 ipv6_addr_label(struct net *net, } /* allocate one entry */ -static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, - const struct in6_addr *prefix, +static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label) { @@ -236,24 +219,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - write_pnet(&newp->lbl_net, net); refcount_set(&newp->refcnt, 1); return newp; } /* add a label */ -static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, + int replace) { - struct hlist_node *n; struct ip6addrlbl_entry *last = NULL, *p = NULL; + struct hlist_node *n; int ret = 0; ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, replace); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && - net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && p->ifindex == newp->ifindex && ipv6_addr_equal(&p->prefix, &newp->prefix)) { if (!replace) { @@ -273,10 +255,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) if (last) hlist_add_behind_rcu(&newp->list, &last->list); else - hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - ip6addrlbl_table.seq++; + net->ipv6.ip6addrlbl_table.seq++; return ret; } @@ -292,12 +274,12 @@ static int ip6addrlbl_add(struct net *net, __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); - newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); + newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) return PTR_ERR(newp); - spin_lock(&ip6addrlbl_table.lock); - ret = __ip6addrlbl_add(newp, replace); - spin_unlock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(net, newp, replace); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) ip6addrlbl_free(newp); return ret; @@ -315,9 +297,8 @@ static int __ip6addrlbl_del(struct net *net, ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && - net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); @@ -340,9 +321,9 @@ static int ip6addrlbl_del(struct net *net, __func__, prefix, prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); - spin_lock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); return ret; } @@ -354,6 +335,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net) ADDRLABEL(KERN_DEBUG "%s\n", __func__); + spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); + INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { int ret = ip6addrlbl_add(net, ip6addrlbl_init_table[i].prefix, @@ -373,14 +357,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) struct hlist_node *n; /* Remove all labels belonging to the exiting net */ - spin_lock(&ip6addrlbl_table.lock); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { - if (net_eq(ip6addrlbl_net(p), net)) { - hlist_del_rcu(&p->list); - ip6addrlbl_put(p); - } + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); } - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } static struct pernet_operations ipv6_addr_label_ops = { @@ -390,8 +372,6 @@ static struct pernet_operations ipv6_addr_label_ops = { int __init ipv6_addr_label_init(void) { - spin_lock_init(&ip6addrlbl_table.lock); - return register_pernet_subsys(&ipv6_addr_label_ops); } @@ -510,11 +490,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) int err; rcu_read_lock(); - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (idx >= s_idx && - net_eq(ip6addrlbl_net(p), net)) { + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -571,7 +550,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); if (p && !ip6addrlbl_hold(p)) p = NULL; - lseq = ip6addrlbl_table.seq; + lseq = net->ipv6.ip6addrlbl_table.seq; rcu_read_unlock(); if (!p) { -- cgit v1.2.3 From bb401caefe9d2c65e0c0fa23b21deecfbfa473fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:08 -0700 Subject: ipv6: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 110 267 5504 1 2 : tunables 8 4 0 : slabdata 110 267 0 real 3m25.292s user 0m0.644s sys 0m40.153s After patch: $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 8 +++++--- net/ipv6/ip6_tunnel.c | 20 +++++++++++--------- net/ipv6/ip6_vti.c | 23 ++++++++++++++--------- net/ipv6/sit.c | 9 ++++++--- 4 files changed, 36 insertions(+), 24 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..c82d41ef25e2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1155,19 +1155,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_net(struct net *net) +static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) { + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip6gre_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + ip6gre_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit = ip6gre_exit_net, + .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..3d6df489b39f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2167,17 +2167,16 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) +static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; - LIST_HEAD(list); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) - unregister_netdevice_queue(dev, &list); + unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); @@ -2186,12 +2185,10 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } - - unregister_netdevice_many(&list); } static int __net_init ip6_tnl_init_net(struct net *net) @@ -2235,16 +2232,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_net(struct net *net) +static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) { + struct net *net; + LIST_HEAD(list); + rtnl_lock(); - ip6_tnl_destroy_tunnels(net); + list_for_each_entry(net, net_list, exit_list) + ip6_tnl_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit = ip6_tnl_exit_net, + .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..714914d1bb98 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1052,23 +1052,22 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) +static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, + struct list_head *list) { int h; struct ip6_tnl *t; - LIST_HEAD(list); for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); - unregister_netdevice_queue(t->dev, &list); - unregister_netdevice_many(&list); + unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) @@ -1108,18 +1107,24 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_net(struct net *net) +static void __net_exit vti6_exit_batch_net(struct list_head *net_list) { - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n; + struct net *net; + LIST_HEAD(list); rtnl_lock(); - vti6_destroy_tunnels(ip6n); + list_for_each_entry(net, net_list, exit_list) { + ip6n = net_generic(net, vti6_net_id); + vti6_destroy_tunnels(ip6n, &list); + } + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit = vti6_exit_net, + .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index ac912bb21747..a799f5258614 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1848,19 +1848,22 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_net(struct net *net) +static void __net_exit sit_exit_batch_net(struct list_head *net_list) { LIST_HEAD(list); + struct net *net; rtnl_lock(); - sit_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + sit_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit = sit_exit_net, + .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; -- cgit v1.2.3 From d4e1b299ec2853dd3d90b71ae86fa2626e60dc25 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 20 Sep 2017 12:18:17 -0400 Subject: ipv6: Use ipv6_authlen for len in ipv6_skip_exthdr In ipv6_skip_exthdr, the lengh of AH header is computed manually as (hp->hdrlen+2)<<2. However, in include/linux/ipv6.h, a macro named ipv6_authlen is already defined for exactly the same job. This commit replaces the manual computation code with the macro. Signed-off-by: Xiang Gao Signed-off-by: David S. Miller --- net/ipv6/exthdrs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 305e2ed730bf..115d60919f72 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -99,7 +99,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hp->hdrlen+2)<<2; + hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); -- cgit v1.2.3 From 63a4e80be4f523dbde6412decfa8244ff73cc4b9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:22:31 +0200 Subject: ipv6: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 96861c702c06..13c3b697f8c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3297,7 +3297,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, struct rt6_info *rt, *prev; rt = addrconf_dst_alloc(idev, &ifp->addr, false); - if (unlikely(IS_ERR(rt))) + if (IS_ERR(rt)) return PTR_ERR(rt); /* ifp->rt can be accessed outside of rtnl */ -- cgit v1.2.3 From 1f372c7bfb23286d2bf4ce0423ab488e86b74bb2 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Mon, 25 Sep 2017 22:01:36 +0100 Subject: net: ipv6: send NS for DAD when link operationally up The NS for DAD are sent on admin up as long as a valid qdisc is found. A race condition exists by which these packets will not egress the interface if the operational state of the lower device is not yet up. The solution is to delay DAD until the link is operationally up according to RFC2863. Rather than only doing this, follow the existing code checks by deferring IPv6 device initialization altogether. The fix allows DAD on devices like tunnels that are controlled by userspace control plane. The fix has no impact on regular deployments, but means that there is no IPv6 connectivity until the port has been opened in the case of port-based network access control, which should be desirable. Signed-off-by: Mike Manning Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 13c3b697f8c0..f553f72d0bee 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_policy = 0, }; -/* Check if a valid qdisc is available */ -static inline bool addrconf_qdisc_ok(const struct net_device *dev) +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool addrconf_link_ready(const struct net_device *dev) { - return !qdisc_tx_is_noop(dev); + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); } static void addrconf_del_rs_timer(struct inet6_dev *idev) @@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->token = in6addr_any; - if (netif_running(dev) && addrconf_qdisc_ok(dev)) + if (netif_running(dev) && addrconf_link_ready(dev)) ndev->if_flags |= IF_READY; ipv6_mc_init_dev(ndev); @@ -3403,7 +3403,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, /* restore routes for permanent addresses */ addrconf_permanent_addr(dev); - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); @@ -3418,7 +3418,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } } else if (event == NETDEV_CHANGE) { - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is still not ready. */ break; } -- cgit v1.2.3 From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 26 Sep 2017 21:32:42 -0700 Subject: net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable even on IPv6 sockets. However, it turns out it is perfectly reasonable to want to set freebind on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP socket option on such a socket (they're all blindly errored out). One use case for this is to allow spoofing src ip on a raw socket via sendmsg cmsg. Tested: built, and booted # python >>> import socket >>> SOL_IP = socket.SOL_IP >>> SOL_IPV6 = socket.IPPROTO_IPV6 >>> IP_FREEBIND = 15 >>> IPV6_FREEBIND = 78 >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 0 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 0 >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 1 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 1 Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 1 + net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'net/ipv6') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 46444f8fbee4..4f8f3eb0699f 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -284,6 +284,7 @@ struct in6_flowlabel_req { #define IPV6_TRANSPARENT 75 #define IPV6_UNICAST_IF 76 #define IPV6_RECVFRAGSIZE 77 +#define IPV6_FREEBIND 78 /* * Multicast Routing: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a5e466d4e093..b9404feabd78 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_FREEBIND: + if (optlen < sizeof(int)) + goto e_inval; + /* we also don't have a separate freebind bit for IPV6 */ + inet_sk(sk)->freebind = valbool; + retv = 0; + break; + case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; @@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = inet_sk(sk)->transparent; break; + case IPV6_FREEBIND: + val = inet_sk(sk)->freebind; + break; + case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; -- cgit v1.2.3 From 5c45121dc39026ab2139910e57cf933fd57d30f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:58:49 +0200 Subject: rtnetlink: remove __rtnl_af_unregister switch the only caller to rtnl_af_unregister. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 2 -- net/core/rtnetlink.c | 14 +------------- net/ipv6/addrconf.c | 4 ++-- 3 files changed, 3 insertions(+), 17 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 6520993ff449..e3ca8e2e3103 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -151,8 +151,6 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void __rtnl_af_unregister(struct rtnl_af_ops *ops); - void rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b63c5759641f..3fb1ca33cba4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -475,18 +475,6 @@ void rtnl_af_register(struct rtnl_af_ops *ops) } EXPORT_SYMBOL_GPL(rtnl_af_register); -/** - * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. - * @ops: struct rtnl_af_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_af_unregister(struct rtnl_af_ops *ops) -{ - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_af_unregister); - /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister @@ -494,7 +482,7 @@ EXPORT_SYMBOL_GPL(__rtnl_af_unregister); void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - __rtnl_af_unregister(ops); + list_del(&ops->list); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f553f72d0bee..837418ff2d4b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6618,9 +6618,9 @@ void addrconf_cleanup(void) unregister_pernet_subsys(&addrconf_ops); ipv6_addr_label_cleanup(); - rtnl_lock(); + rtnl_af_unregister(&inet6_ops); - __rtnl_af_unregister(&inet6_ops); + rtnl_lock(); /* clean dev list */ for_each_netdev(&init_net, dev) { -- cgit v1.2.3 From 380537b4f7f2e706e499695b38eabc90a5f72fa8 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 02:07:08 +0800 Subject: net: ipv6: remove unused code in ipv6_find_hdr() Storing the left length of skb into 'len' actually has no effect so we can remove it. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- net/ipv6/exthdrs_core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 115d60919f72..11025f8d124b 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -187,7 +187,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len; bool found; if (fragoff) @@ -204,7 +203,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } - len = skb->len - start; do { struct ipv6_opt_hdr _hdr, *hp; @@ -273,7 +271,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, if (!found) { nexthdr = hp->nexthdr; - len -= hdrlen; start += hdrlen; } } while (!found); -- cgit v1.2.3 From cc71b7b071192ac1c288e272fdc3f3877eb96663 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 5 Oct 2017 15:45:32 -0400 Subject: net/ipv6: remove unused err variable on icmpv6_push_pending_frames int err is unused by icmpv6_push_pending_frames(), this patch returns removes the variable and returns the function with 0. git bisect shows this variable has been around since linux has been in git in commit 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2. This was found by running make coccicheck M=net/ipv6/ on linus' tree on commit 77ede3a014a32746002f7889211f0cecf4803163 (current HEAD as of this patch). Signed-off-by: Tim Hansen Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5acb54405b10..aeb49b4d8c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -255,7 +255,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, { struct sk_buff *skb; struct icmp6hdr *icmp6h; - int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) @@ -288,7 +287,7 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } ip6_push_pending_frames(sk); out: - return err; + return 0; } struct icmpv6_msg { -- cgit v1.2.3 From 4e64b1ed15e25b8dcc2819c6d43dab72eb0bea26 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Oct 2017 23:46:14 -0700 Subject: net/ipv6: Convert icmpv6_push_pending_frames to void commit cc71b7b07119 ("net/ipv6: remove unused err variable on icmpv6_push_pending_frames") exposed icmpv6_push_pending_frames return value not being used. Remove now unnecessary int err declarations and uses. Miscellanea: o Remove unnecessary goto and out: labels o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/ipv6.h | 4 ++-- net/ipv6/icmp.c | 43 ++++++++++++++++++------------------------- net/ipv6/ping.c | 5 ++--- 3 files changed, 22 insertions(+), 30 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6eac5cf8f1e6..3cda3b521c36 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -300,8 +300,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len); +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index aeb49b4d8c7d..4e52d52a6752 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -250,15 +250,15 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset) return (*op & 0xC0) == 0x80; } -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len) +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; skb = skb_peek(&sk->sk_write_queue); if (!skb) - goto out; + return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); @@ -286,8 +286,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, tmp_csum); } ip6_push_pending_frames(sk); -out: - return 0; } struct icmpv6_msg { @@ -437,7 +435,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, int iif = 0; int addr_type = 0; int len; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || @@ -574,17 +571,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, rcu_read_lock(); idev = __in6_dev_get(skb->dev); - err = ip6_append_data(sk, icmpv6_getfrag, &msg, - len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused); - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), + &ipc6, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, &sockc_unused)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); } rcu_read_unlock(); out_dst_release: @@ -681,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); struct sockcm_cookie sockc_unused = {0}; @@ -718,8 +713,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(net, sk, &dst, &fl6); - if (err) + if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) @@ -736,17 +730,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; - err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused); - - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), &ipc6, &fl6, + (struct rt6_info *)dst, MSG_DONTWAIT, + &sockc_unused)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - skb->len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); } dst_release(dst); out: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ac826dd338ff..d12c55dad7d1 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -154,9 +154,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, - (struct icmp6hdr *) &pfh.icmph, - len); + icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *)&pfh.icmph, len); } release_sock(sk); -- cgit v1.2.3 From 180ca444b985c42948fa26abd278e616b5ce7eb2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:56 -0700 Subject: ipv6: introduce a new function fib6_update_sernum() This function takes a route as input and tries to update the sernum in the fib6_node this route is associated with. It will be used in later commit when adding a cached route into the exception table under that route. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ net/ipv6/ip6_fib.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'net/ipv6') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d060d711a624..152b7b14a5a5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -358,6 +358,8 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); +void fib6_update_sernum(struct rt6_info *rt); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e5308d7cbd75..0ba4fbb2f855 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -110,6 +110,20 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; +void fib6_update_sernum(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct net *net = dev_net(rt->dst.dev); + struct fib6_node *fn; + + write_lock_bh(&table->tb6_lock); + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&table->tb6_lock)); + if (fn) + fn->fn_sernum = fib6_new_sernum(net); + write_unlock_bh(&table->tb6_lock); +} + /* * Auxiliary address test functions for the radix tree. * -- cgit v1.2.3 From 35732d01fe311ec13c4e42936878b782b8e7ea85 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:57 -0700 Subject: ipv6: introduce a hash table to store dst cache Add a hash table into struct rt6_info in order to store dst caches created by pmtu discovery and ip redirect in ipv6 routing code. APIs to add dst cache, delete dst cache, find dst cache and update dst cache in the hash table are implemented and will be used in later commits. This is a preparation work to move all cache routes into the exception table instead of getting inserted into the fib6 tree. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 19 +++ include/net/ip6_route.h | 3 + net/ipv6/route.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 363 insertions(+) (limited to 'net/ipv6') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 152b7b14a5a5..c4864c1e8f13 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -98,6 +98,22 @@ struct rt6key { struct fib6_table; +struct rt6_exception_bucket { + struct hlist_head chain; + int depth; +}; + +struct rt6_exception { + struct hlist_node hlist; + struct rt6_info *rt6i; + unsigned long stamp; + struct rcu_head rcu; +}; + +#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 +#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) +#define FIB6_MAX_DEPTH 5 + struct rt6_info { struct dst_entry dst; @@ -134,12 +150,15 @@ struct rt6_info { struct inet6_dev *rt6i_idev; struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 exception_bucket_flushed:1, + unused:7; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ee96f402cb75..3315605f34c9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -95,6 +95,9 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); +void rt6_flush_exceptions(struct rt6_info *rt); +int rt6_remove_exception_rt(struct rt6_info *rt); + static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, unsigned int prefs, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..dc5e70975966 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -392,6 +396,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_exception_bucket *bucket; struct dst_entry *from = dst->from; struct inet6_dev *idev; @@ -404,6 +409,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); + if (bucket) { + rt->rt6i_exception_bucket = NULL; + kfree(bucket); + } dst->from = NULL; dst_release(from); @@ -1091,6 +1101,337 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } +/* exception hash table implementation + */ +static DEFINE_SPINLOCK(rt6_exception_lock); + +/* Remove rt6_ex from hash table and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_remove_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex) +{ + if (!bucket || !rt6_ex) + return; + rt6_ex->rt6i->rt6i_node = NULL; + hlist_del_rcu(&rt6_ex->hlist); + rt6_release(rt6_ex->rt6i); + kfree_rcu(rt6_ex, rcu); + WARN_ON_ONCE(!bucket->depth); + bucket->depth--; +} + +/* Remove oldest rt6_ex in bucket and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) +{ + struct rt6_exception *rt6_ex, *oldest = NULL; + + if (!bucket) + return; + + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) + oldest = rt6_ex; + } + rt6_remove_exception(bucket, oldest); +} + +static u32 rt6_exception_hash(const struct in6_addr *dst, + const struct in6_addr *src) +{ + static u32 seed __read_mostly; + u32 val; + + net_get_random_once(&seed, sizeof(seed)); + val = jhash(dst, sizeof(*dst), seed); + +#ifdef CONFIG_IPV6_SUBTREES + if (src) + val = jhash(src, sizeof(*src), val); +#endif + return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rt6_exception_lock + */ +static struct rt6_exception * +__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rcu_read_lock() + */ +static struct rt6_exception * +__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +static int rt6_insert_exception(struct rt6_info *nrt, + struct rt6_info *ort) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err = 0; + + /* ort can't be a cache or pcpu route */ + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; + WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); + + spin_lock_bh(&rt6_exception_lock); + + if (ort->exception_bucket_flushed) { + err = -EINVAL; + goto out; + } + + bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) { + bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), + GFP_ATOMIC); + if (!bucket) { + err = -ENOMEM; + goto out; + } + rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + } + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates ort is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (ort->rt6i_src.plen) + src_key = &nrt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_remove_exception(bucket, rt6_ex); + + rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); + if (!rt6_ex) { + err = -ENOMEM; + goto out; + } + rt6_ex->rt6i = nrt; + rt6_ex->stamp = jiffies; + atomic_inc(&nrt->rt6i_ref); + nrt->rt6i_node = ort->rt6i_node; + hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); + bucket->depth++; + + if (bucket->depth > FIB6_MAX_DEPTH) + rt6_exception_remove_oldest(bucket); + +out: + spin_unlock_bh(&rt6_exception_lock); + + /* Update fn->fn_sernum to invalidate all cached dst */ + if (!err) + fib6_update_sernum(ort); + + return err; +} + +void rt6_flush_exceptions(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + spin_lock_bh(&rt6_exception_lock); + /* Prevent rt6_insert_exception() to recreate the bucket list */ + rt->exception_bucket_flushed = 1; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) + goto out; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) + rt6_remove_exception(bucket, rt6_ex); + WARN_ON_ONCE(bucket->depth); + bucket++; + } + +out: + spin_unlock_bh(&rt6_exception_lock); +} + +/* Find cached rt in the hash table inside passed in rt + * Caller has to hold rcu_read_lock() + */ +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + struct rt6_info *res = NULL; + + bucket = rcu_dereference(rt->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates rt is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (rt->rt6i_src.plen) + src_key = saddr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + res = rt6_ex->rt6i; + + return res; +} + +/* Remove the passed in cached rt from the hash table that contains it */ +int rt6_remove_exception_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return -EINVAL; + + if (!rcu_access_pointer(from->rt6i_exception_bucket)) + return -ENOENT; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(from->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) { + rt6_remove_exception(bucket, rt6_ex); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&rt6_exception_lock); + return err; +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return; + + rcu_read_lock(); + bucket = rcu_dereference(from->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_ex->stamp = jiffies; + + rcu_read_unlock(); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { -- cgit v1.2.3 From 60006a4825f9e71adf770745e17790c6e6c97a89 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:58 -0700 Subject: ipv6: prepare fib6_remove_prefsrc() for exception table After we move cached dst entries into the exception table under its parent route, current fib6_remove_prefsrc() no longer can access them. This commit makes fib6_remove_prefsrc() also go through all routes in the exception table to remove the pref src. This is a preparation patch in order to move all cached dst into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dc5e70975966..f52ac57dcc99 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1264,6 +1264,12 @@ static int rt6_insert_exception(struct rt6_info *nrt, if (ort->rt6i_src.plen) src_key = &nrt->rt6i_src.addr; #endif + + /* Update rt6i_prefsrc as it could be changed + * in rt6_remove_prefsrc() + */ + nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); if (rt6_ex) @@ -1432,6 +1438,25 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } +static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + rt6_ex->rt6i->rt6i_prefsrc.plen = 0; + } + bucket++; + } + } +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3159,8 +3184,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) if (((void *)rt->dst.dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->rt6i_prefsrc.plen = 0; + /* need to update cache as well */ + rt6_exceptions_remove_prefsrc(rt); + spin_unlock_bh(&rt6_exception_lock); } return 0; } -- cgit v1.2.3 From f5bbe7ee79c2842ca69f25afd0b6b65a9011b735 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:59 -0700 Subject: ipv6: prepare rt6_mtu_change() for exception table If we move all cached dst into the exception table under the main route, current rt6_mtu_change() will no longer be able to access them. This commit makes rt6_mtu_change_route() function to also go through all cached routes in the exception table under the main route and do proper updates on the mtu. This is a preparation in order to move all cached routes into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f52ac57dcc99..d9805a857809 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1269,6 +1269,14 @@ static int rt6_insert_exception(struct rt6_info *nrt, * in rt6_remove_prefsrc() */ nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + /* rt6_mtu_change() might lower mtu on ort. + * Only insert this exception route if its mtu + * is less than ort's mtu value. + */ + if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + err = -EINVAL; + goto out; + } rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); @@ -1457,6 +1465,32 @@ static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) } } +static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu) + entry->rt6i_pmtu = mtu; + } + bucket++; + } + } +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3296,6 +3330,10 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { + spin_lock_bh(&rt6_exception_lock); + /* This case will be removed once the exception table + * is hooked up. + */ if (rt->rt6i_flags & RTF_CACHE) { /* For RTF_CACHE with rt6i_pmtu == 0 * (i.e. a redirected route), @@ -3309,6 +3347,8 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } + rt6_exceptions_update_pmtu(rt, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); } return 0; } -- cgit v1.2.3 From b16cb459d77800dcb886b5e73e1beafd3d596897 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:00 -0700 Subject: ipv6: prepare rt6_clean_tohost() for exception table If we move all cached dst into the exception table under the main route, current rt6_clean_tohost() will no longer be able to access them. This commit makes fib6_clean_tohost() to also go through all cached routes in exception table and removes cached gateway routes to the passed in gateway. This is a preparation in order to move all cached routes into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d9805a857809..e8e901589564 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1491,6 +1491,43 @@ static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) } } +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +static void rt6_exceptions_clean_tohost(struct rt6_info *rt, + struct in6_addr *gateway) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + + if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == + RTF_CACHE_GATEWAY && + ipv6_addr_equal(gateway, + &entry->rt6i_gateway)) { + rt6_remove_exception(bucket, rt6_ex); + } + } + bucket++; + } + } + + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3240,18 +3277,27 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) -#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + /* RTF_CACHE_GATEWAY case will be removed once the exception + * table is hooked up to store all cached routes. + */ if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } + + /* Further clean up cached routes in exception table. + * This is needed because cached route may have a different + * gateway than its 'parent' in the case of an ip redirect. + */ + rt6_exceptions_clean_tohost(rt, gateway); + return 0; } -- cgit v1.2.3 From c757faa8bfa26a0dd24b41ff783e0da042156887 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:01 -0700 Subject: ipv6: prepare fib6_age() for exception table If all dst cache entries are stored in the exception table under the main route, we have to go through them during fib6_age() when doing garbage collecting. Introduce a new function rt6_age_exception() which goes through all dst entries in the exception table and remove those entries that are expired. This function is called in fib6_age() so that all dst caches are also garbage collected. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 13 +++++++++++ include/net/ip6_route.h | 2 ++ net/ipv6/ip6_fib.c | 26 ++++++++------------- net/ipv6/route.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 17 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c4864c1e8f13..11a79ef87a28 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -29,6 +29,14 @@ #define FIB6_TABLE_HASHSZ 1 #endif +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) pr_debug(x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif + struct rt6_info; struct fib6_config { @@ -75,6 +83,11 @@ struct fib6_node { struct rcu_head rcu; }; +struct fib6_gc_args { + int timeout; + int more; +}; + #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3315605f34c9..a0087fb9864b 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -97,6 +97,8 @@ int ip6_del_rt(struct rt6_info *); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); +void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0ba4fbb2f855..3afbe50f2779 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -38,14 +38,6 @@ #include #include -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { @@ -1890,12 +1882,6 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -struct fib6_gc_args -{ - int timeout; - int more; -}; - static int fib6_age(struct rt6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; @@ -1904,9 +1890,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) /* * check addrconf expiration here. * Routes are expired even if they are in use. - * - * Also age clones. Note, that clones are aged out - * only if they are not in use now. */ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { @@ -1915,6 +1898,9 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; + /* The following part will soon be removed when the exception + * table is hooked up to store all cached routes. + */ } else if (rt->rt6i_flags & RTF_CACHE) { if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1940,6 +1926,12 @@ static int fib6_age(struct rt6_info *rt, void *arg) gc_args->more++; } + /* Also age clones in the exception table. + * Note, that clones are aged out + * only if they are not in use now. + */ + rt6_age_exceptions(rt, gc_args, now); + return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e8e901589564..d2dd55f58b5d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1528,6 +1528,66 @@ static void rt6_exceptions_clean_tohost(struct rt6_info *rt, spin_unlock_bh(&rt6_exception_lock); } +static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_info *rt = rt6_ex->rt6i; + + if (atomic_read(&rt->dst.__refcnt) == 1 && + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } + gc_args->more++; +} + +void rt6_age_exceptions(struct rt6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + rt6_age_examine_exception(bucket, rt6_ex, + gc_args, now); + } + bucket++; + } + } + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { -- cgit v1.2.3 From 38fbeeeeccdb38d0635398e8e344d245f6d8dc52 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:02 -0700 Subject: ipv6: prepare fib6_locate() for exception table fib6_locate() is used to find the fib6_node according to the passed in prefix address key. It currently tries to find the fib6_node with the exact match of the passed in key. However, when we move cached routes into the exception table, fib6_locate() will fail to find the fib6_node for it as the cached routes will be stored in the exception table under the fib6_node with the longest prefix match of the cache's dst addr key. This commit adds a new parameter to let the caller specify if it needs exact match or longest prefix match. Right now, all callers still does exact match when calling fib6_locate(). It will be changed in later commit where exception table is hooked up to store cached routes. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 30 +++++++++++++++++++++++------- net/ipv6/route.c | 5 +++-- 4 files changed, 29 insertions(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 11a79ef87a28..4497a1eb4d41 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -357,7 +357,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len); + const struct in6_addr *saddr, int src_len, + bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 837418ff2d4b..3ccaf52824c9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2322,7 +2322,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3afbe50f2779..b3e4cf0962f8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1343,14 +1343,21 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) + * exact_match == true means we try to find fn with exact match of + * the passed in prefix addr + * exact_match == false means we try to find fn with longest prefix + * match of the passed in prefix addr. This is useful for finding fn + * for cached route as it will be stored in the exception table under + * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, - int plen, int offset) + int plen, int offset, + bool exact_match) { - struct fib6_node *fn; + struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); @@ -1360,11 +1367,13 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) - return NULL; + goto out; if (plen == fn->fn_bit) return fn; + prev = fn; + /* * We have more bits to go */ @@ -1373,24 +1382,31 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, else fn = fn->left; } - return NULL; +out: + if (exact_match) + return NULL; + else + return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) + const struct in6_addr *saddr, int src_len, + bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst)); + offsetof(struct rt6_info, rt6i_dst), + exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn && fn->subtree) fn = fib6_locate_1(fn->subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src)); + offsetof(struct rt6_info, rt6i_src), + exact_match); } #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d2dd55f58b5d..855b4ceec349 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2800,7 +2800,8 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, - &cfg->fc_src, cfg->fc_src_len); + &cfg->fc_src, cfg->fc_src_len, + true); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { @@ -3009,7 +3010,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; -- cgit v1.2.3 From 2b760fcf5cfb34e8610df56d83745b2b74ae1379 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:03 -0700 Subject: ipv6: hook up exception table to store dst cache This commit makes use of the exception hash table implementation to store dst caches created by pmtu discovery and ip redirect into the hash table under the rt_info and no longer inserts these routes into fib6 tree. This makes the fib6 tree only contain static configured routes and could now be protected by rcu instead of a rw lock. With this change, in the route lookup related functions, after finding the rt6_info with the longest prefix, we also need to search for the exception table before doing backtracking. In the route delete function, if the route being deleted is not a dst cache, deletion of this route also need to flush the whole hash table under it. If it is a dst cache, then only delete the cached dst in the hash table. Note: for fib6_walk_continue() function, w->root now is always pointing to a root node considering that fib6_prune_clones() is removed from the code. So we add a WARN_ON() msg to make sure w->root always points to a root node and also removed the update of w->root in fib6_repair_tree(). This is a prerequisite for later patch because we don't need to make w->root as rcu protected when replacing rwlock with RCU. Also, we remove all prune related variables as it is no longer used. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - net/ipv6/addrconf.c | 1 - net/ipv6/ip6_fib.c | 95 ++++++++------------------------------------ net/ipv6/route.c | 108 +++++++++++++++++++++++++------------------------- 4 files changed, 72 insertions(+), 133 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4497a1eb4d41..d0b7283073e3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -280,7 +280,6 @@ struct fib6_walker { struct fib6_node *root, *node; struct rt6_info *leaf; enum fib6_walk_state state; - bool prune; unsigned int skip; unsigned int count; int (*func)(struct fib6_walker *); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3ccaf52824c9..873afafddfc4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2326,7 +2326,6 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!fn) goto out; - noflags |= RTF_CACHE; for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->dst.dev->ifindex != dev->ifindex) continue; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b3e4cf0962f8..9c8e704e6af7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,7 +54,6 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); @@ -1101,6 +1100,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; + if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1192,11 +1193,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) { + if (!err) fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn); - } out: if (err) { @@ -1511,19 +1509,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { - if (w->root == fn) { - w->root = w->node = NULL; - RT6_TRACE("W %p adjusted by delroot 1\n", w); - } else if (w->node == fn) { + if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { - if (w->root == fn) { - w->root = child; - RT6_TRACE("W %p adjusted by delroot 2\n", w); - } if (w->node == fn) { w->node = child; if (children&2) { @@ -1557,12 +1548,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, RT6_TRACE("fib6_del_route\n"); + WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); + /* Unlink it */ *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); + /* Reset round-robin state, if necessary */ if (fn->rr_ptr == rt) fn->rr_ptr = NULL; @@ -1625,18 +1621,9 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags & RTF_CACHE)) { - struct fib6_node *pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - /* clones of this route might be in another subtree */ - if (rt->rt6i_src.plen) { - while (!(pn->fn_flags & RTN_ROOT)) - pn = pn->parent; - pn = pn->parent; - } -#endif - fib6_prune_clones(info->nl_net, pn); - } + /* remove cached dst from exception table */ + if (rt->rt6i_flags & RTF_CACHE) + return rt6_remove_exception_rt(rt); /* * Walk the leaf entries looking for ourself @@ -1679,16 +1666,14 @@ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn; + /* w->root should always be table->tb6_root */ + WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); + for (;;) { fn = w->node; if (!fn) return 0; - if (w->prune && fn != w->root && - fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { - w->state = FWS_C; - w->leaf = fn->leaf; - } switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: @@ -1820,20 +1805,16 @@ static int fib6_clean_node(struct fib6_walker *w) * func is called on each route. * It may return -1 -> delete this route. * 0 -> continue walking - * - * prune==1 -> only immediate children of node (certainly, - * ignoring pure split nodes) will be scanned. */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - bool prune, int sernum, void *arg) + int sernum, void *arg) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; - c.w.prune = prune; c.w.count = 0; c.w.skip = 0; c.func = func; @@ -1858,7 +1839,7 @@ static void __fib6_clean_all(struct net *net, hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, false, sernum, arg); + func, sernum, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1871,22 +1852,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); } -static int fib6_prune_clone(struct rt6_info *rt, void *arg) -{ - if (rt->rt6i_flags & RTF_CACHE) { - RT6_TRACE("pruning clone %p\n", rt); - return -1; - } - - return 0; -} - -static void fib6_prune_clones(struct net *net, struct fib6_node *fn) -{ - fib6_clean_tree(net, fn, fib6_prune_clone, true, - FIB6_NO_SERNUM_CHANGE, NULL); -} - static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); @@ -1914,32 +1879,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; - /* The following part will soon be removed when the exception - * table is hooked up to store all cached routes. - */ - } else if (rt->rt6i_flags & RTF_CACHE) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) - rt->dst.obsolete = DST_OBSOLETE_KILL; - if (atomic_read(&rt->dst.__refcnt) == 1 && - rt->dst.obsolete == DST_OBSOLETE_KILL) { - RT6_TRACE("aging clone %p\n", rt); - return -1; - } else if (rt->rt6i_flags & RTF_GATEWAY) { - struct neighbour *neigh; - __u8 neigh_flags = 0; - - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { - neigh_flags = neigh->flags; - neigh_release(neigh); - } - if (!(neigh_flags & NTF_ROUTER)) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; - } - } - gc_args->more++; } /* Also age clones in the exception table. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 855b4ceec349..65130dde276a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -878,8 +878,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - struct rt6_info *rt; read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); @@ -893,6 +893,11 @@ restart: if (fn) goto restart; } + /* Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); @@ -1592,7 +1597,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1624,6 +1629,10 @@ redo_rt6_select: } } + /*Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { dst_use(&rt->dst, jiffies); @@ -1988,23 +1997,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); + /* update rt6_ex->stamp for cache */ + if (rt6->rt6i_flags & RTF_CACHE) + rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct rt6_info *nrt6; nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - - /* ip6_ins_rt(nrt6) will bump the - * rt6->rt6i_node->fn_sernum - * which will fail the next rt6_check() and - * invalidate the sk->sk_dst_cache. - */ - ip6_ins_rt(nrt6); - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt6->dst); + if (rt6_insert_exception(nrt6, rt6)) + dst_release_immediate(&nrt6->dst); } } } @@ -2068,7 +2071,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2093,8 +2096,23 @@ restart: continue; if (fl6->flowi6_oif != rt->dst.dev->ifindex) continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. + */ + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + rt_cache = rt6_find_cached_rt(rt, + &fl6->daddr, + &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(&rdfl->gateway, + &rt_cache->rt6i_gateway)) { + rt = rt_cache; + break; + } continue; + } break; } @@ -2785,9 +2803,9 @@ out_put: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { + struct rt6_info *rt, *rt_cache; struct fib6_table *table; struct fib6_node *fn; - struct rt6_info *rt; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); @@ -2801,13 +2819,17 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, - true); + !(cfg->fc_flags & RTF_CACHE)); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { - if ((rt->rt6i_flags & RTF_CACHE) && - !(cfg->fc_flags & RTF_CACHE)) - continue; + if (cfg->fc_flags & RTF_CACHE) { + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + &cfg->fc_src); + if (!rt_cache) + continue; + rt = rt_cache; + } if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -2933,8 +2955,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - if (ip6_ins_rt(nrt)) - goto out_release; + /* No need to remove rt from the exception table if rt is + * a cached route because rt6_insert_exception() will + * takes care of it + */ + if (rt6_insert_exception(nrt, rt)) { + dst_release_immediate(&nrt->dst); + goto out; + } netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2942,17 +2970,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags & RTF_CACHE) { - rt = (struct rt6_info *) dst_clone(&rt->dst); - ip6_del_rt(rt); - } - -out_release: - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt->dst); - out: neigh_release(neigh); } @@ -3344,12 +3361,8 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - /* RTF_CACHE_GATEWAY case will be removed once the exception - * table is hooked up to store all cached routes. - */ - if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || - ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } @@ -3438,20 +3451,9 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); - /* This case will be removed once the exception table - * is hooked up. - */ - if (rt->rt6i_flags & RTF_CACHE) { - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. a redirected route), - * the metrics of its rt->dst.from has already - * been updated. - */ - if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) - rt->rt6i_pmtu = arg->mtu; - } else if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } rt6_exceptions_update_pmtu(rt, arg->mtu); -- cgit v1.2.3 From a94b9367e044ba672c9f4105eb1516ff6ff4948a Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:04 -0700 Subject: ipv6: grab rt->rt6i_ref before allocating pcpu rt After rwlock is replaced with rcu and spinlock, ip6_pol_route() will be called with only rcu held. That means rt6 route deletion could happen simultaneously with rt6_make_pcpu_rt(). This could potentially cause memory leak if rt6_release() is called right before rt6_make_pcpu_rt() on the same route. This patch grabs rt->rt6i_ref safely before calling rt6_make_pcpu_rt() to make sure rt6_release() will not get triggered while rt6_make_pcpu_rt() is in progress. And rt6_release() is called after rt6_make_pcpu_rt() is finished. Note: As we are incrementing rt->rt6i_ref in ip6_pol_route(), there is a very slim chance that fib6_purge_rt() will be triggered unnecessarily when deleting a route if ip6_pol_route() running on another thread picks this route as well and tries to make pcpu cache for it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 58 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 65130dde276a..941c062389d2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1070,7 +1070,6 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); @@ -1081,28 +1080,20 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return net->ipv6.ip6_null_entry; } - read_lock_bh(&table->tb6_lock); - if (rt->rt6i_pcpu) { - p = this_cpu_ptr(rt->rt6i_pcpu); - prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = prev; - } - } else { - /* rt has been removed from the fib6 tree - * before we have a chance to acquire the read_lock. - * In this case, don't brother to create a pcpu rt - * since rt is going away anyway. The next - * dst_check() will trigger a re-lookup. - */ + dst_hold(&pcpu_rt->dst); + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + /* release refcnt taken by ip6_rt_pcpu_alloc() */ + dst_release_immediate(&pcpu_rt->dst); + /* release refcnt taken by above dst_hold() */ dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = rt; + dst_hold(&prev->dst); + pcpu_rt = prev; } - dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); - read_unlock_bh(&table->tb6_lock); return pcpu_rt; } @@ -1683,19 +1674,28 @@ redo_rt6_select: if (pcpu_rt) { read_unlock_bh(&table->tb6_lock); } else { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. - */ - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - pcpu_rt = rt6_make_pcpu_route(rt); - dst_release(&rt->dst); + /* atomic_inc_not_zero() is needed when using rcu */ + if (atomic_inc_not_zero(&rt->rt6i_ref)) { + /* We have to do the read_unlock first + * because rt6_make_pcpu_route() may trigger + * ip6_dst_gc() which will take the write_lock. + * + * No dst_hold() on rt is needed because grabbing + * rt->rt6i_ref makes sure rt can't be released. + */ + read_unlock_bh(&table->tb6_lock); + pcpu_rt = rt6_make_pcpu_route(rt); + rt6_release(rt); + } else { + /* rt is already removed from tree */ + read_unlock_bh(&table->tb6_lock); + pcpu_rt = net->ipv6.ip6_null_entry; + dst_hold(&pcpu_rt->dst); + } } trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; - } } EXPORT_SYMBOL_GPL(ip6_pol_route); -- cgit v1.2.3 From 51e398e86d61b69b9a4be49ff7f6afeb87530df1 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:05 -0700 Subject: ipv6: don't release rt->rt6i_pcpu memory during rt6_release() After rwlock is replaced with rcu and spinlock, route lookup can happen simultanously with route deletion. This patch removes the call to free_percpu(rt->rt6i_pcpu) from rt6_release() to avoid the race condition between rt6_release() and rt6_get_pcpu_route(). And as free_percpu(rt->rt6i_pcpu) is already called in ip6_dst_destroy() after the rcu grace period, it is safe to do this change. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9c8e704e6af7..eee392f7b1f6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -190,9 +190,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) *ppcpu_rt = NULL; } } - - free_percpu(non_pcpu_rt->rt6i_pcpu); - non_pcpu_rt->rt6i_pcpu = NULL; } EXPORT_SYMBOL_GPL(rt6_free_pcpu); -- cgit v1.2.3 From d3843fe5fd45be0e04a251a2cc68893c859a31bd Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:06 -0700 Subject: ipv6: replace dst_hold() with dst_hold_safe() in routing code With rwlock, it is safe to call dst_hold() in the read thread because read thread is guaranteed to be separated from write thread. However, after we replace rwlock with rcu, it is no longer safe to use dst_hold(). A dst might already have been deleted but is waiting for the rcu grace period to pass before freeing the memory when a read thread is trying to do dst_hold(). This could potentially cause double free issue. So this commit replaces all dst_hold() with dst_hold_safe() in all read thread to avoid this double free issue. And in order to make the code more compact, a new function ip6_hold_safe() is introduced. It calls dst_hold_safe() first, and if that fails, it will either fall back to hold and return net->ipv6.ip6_null_entry or set rt to NULL according to the caller's need. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 ++- net/ipv6/route.c | 71 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 20 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 873afafddfc4..f86e931d555e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2333,7 +2333,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & noflags) != 0) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + rt = NULL; break; } out: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 941c062389d2..aeb349aea429 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -874,6 +874,23 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, } } +static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, + bool null_fallback) +{ + struct rt6_info *rt = *prt; + + if (dst_hold_safe(&rt->dst)) + return true; + if (null_fallback) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } else { + rt = NULL; + } + *prt = rt; + return false; +} + static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) @@ -898,7 +915,9 @@ restart: if (rt_cache) rt = rt_cache; - dst_use(&rt->dst, jiffies); + if (ip6_hold_safe(net, &rt, true)) + dst_use_noref(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -1061,10 +1080,9 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt) { - dst_hold(&pcpu_rt->dst); + if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) rt6_dst_from_metrics_check(pcpu_rt); - } + return pcpu_rt; } @@ -1625,12 +1643,17 @@ redo_rt6_select: if (rt_cache) rt = rt_cache; - if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { - dst_use(&rt->dst, jiffies); + if (rt == net->ipv6.ip6_null_entry) { + read_unlock_bh(&table->tb6_lock); + dst_hold(&rt->dst); + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + return rt; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + rt6_dst_from_metrics_check(rt); + } read_unlock_bh(&table->tb6_lock); - - rt6_dst_from_metrics_check(rt); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1643,7 +1666,13 @@ redo_rt6_select: struct rt6_info *uncached_rt; - dst_use(&rt->dst, jiffies); + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + } else { + read_unlock_bh(&table->tb6_lock); + uncached_rt = rt; + goto uncached_rt_out; + } read_unlock_bh(&table->tb6_lock); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); @@ -1659,6 +1688,7 @@ redo_rt6_select: dst_hold(&uncached_rt->dst); } +uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; @@ -1667,8 +1697,7 @@ redo_rt6_select: struct rt6_info *pcpu_rt; - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_use_noref(&rt->dst, jiffies); pcpu_rt = rt6_get_pcpu_route(rt); if (pcpu_rt) { @@ -2130,7 +2159,7 @@ restart: } out: - dst_hold(&rt->dst); + ip6_hold_safe(net, &rt, true); read_unlock_bh(&table->tb6_lock); @@ -2841,7 +2870,8 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + break; read_unlock_bh(&table->tb6_lock); /* if gateway was specified only delete the one hop */ @@ -3038,7 +3068,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); break; } out: @@ -3096,7 +3126,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev break; } if (rt) - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); read_unlock_bh(&table->tb6_lock); return rt; } @@ -3139,9 +3169,12 @@ restart: for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - ip6_del_rt(rt); + if (dst_hold_safe(&rt->dst)) { + read_unlock_bh(&table->tb6_lock); + ip6_del_rt(rt); + } else { + read_unlock_bh(&table->tb6_lock); + } goto restart; } } -- cgit v1.2.3 From bbd63f06d114a52be33f6982fc89ca2768cdeb62 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:07 -0700 Subject: ipv6: update fn_sernum after route is inserted to tree fib6_add() logic currently calls fib6_add_1() to figure out what node should be used for the newly added route and then call fib6_add_rt2node() to insert the route to the node. And during the call of fib6_add_1(), fn_sernum is updated for all nodes that share the same prefix as the new route. This does not have issue in the current code because reader thread will not be able to access the tree while writer thread is inserting new route to it. However, it is not the case once we transition to use RCU. Reader thread could potentially see the new fn_sernum before the new route is inserted. As a result, reader thread's route lookup will return a stale route with the new fn_sernum. In order to solve this issue, we remove all the update of fn_sernum in fib6_add_1(), and instead, introduce a new function that updates fn_sernum for all related nodes and call this functions once the route is successfully inserted to the tree. Also, smp_wmb() is used after a route is successfully inserted into the fib tree and right before the updated of fn->sernum. And smp_rmb() is used right after fn->sernum is accessed in rt6_get_cookie_safe(). This is to guarantee that when the reader thread sees the new fn->sernum, the new route is already inserted in the tree in memory. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ net/ipv6/ip6_fib.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 18 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d0b7283073e3..6bf929b50951 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -220,6 +220,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, if (fn) { *cookie = fn->fn_sernum; + /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ + smp_rmb(); status = true; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index eee392f7b1f6..f604b311cc3e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -585,7 +585,7 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required, int sernum, + int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; @@ -631,8 +631,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn->leaf = NULL; } - fn->fn_sernum = sernum; - return fn; } @@ -641,7 +639,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, */ /* Try to walk down on tree. */ - fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? fn->right : fn->left; @@ -677,7 +674,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, ln->fn_bit = plen; ln->parent = pn; - ln->fn_sernum = sernum; if (dir) pn->right = ln; @@ -737,8 +733,6 @@ insert_above: in->leaf = fn->leaf; atomic_inc(&in->leaf->rt6i_ref); - in->fn_sernum = sernum; - /* update parent pointer */ if (dir) pn->right = in; @@ -750,8 +744,6 @@ insert_above: ln->parent = in; fn->parent = in; - ln->fn_sernum = sernum; - if (addr_bit_set(addr, bit)) { in->right = ln; in->left = fn; @@ -776,8 +768,6 @@ insert_above: ln->parent = pn; - ln->fn_sernum = sernum; - if (dir) pn->right = ln; else @@ -1079,6 +1069,20 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } +static void fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) +{ + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + + /* paired with smp_rmb() in rt6_get_cookie_safe() */ + smp_wmb(); + while (fn) { + fn->fn_sernum = sernum; + fn = fn->parent; + } +} + /* * Add routing information to the routing tree. * / @@ -1111,7 +1115,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum, extack); + replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1145,15 +1149,13 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -1172,8 +1174,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); @@ -1190,8 +1191,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) + if (!err) { + fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); + } out: if (err) { -- cgit v1.2.3 From 8d1040e808bb2a5aeb4f0791b32bc7356a01ab11 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:08 -0700 Subject: ipv6: check fn->leaf before it is used If rwlock is replaced with rcu and spinlock, it is possible that the reader thread will see fn->leaf as NULL in the following scenarios: 1. fib6_add() is in progress and we have already inserted a new node but not yet inserted the route. 2. fib6_del_route() is in progress and we have already set fn->leaf to NULL but not yet freed the node because of rcu grace period. This patch makes sure all the reader threads check fn->leaf first before using it. And together with later patch to grab rcu_read_lock() and rcu_dereference() fn->leaf, it makes sure reader threads are safe when accessing fn->leaf. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 23 ++++++++++++++++++----- net/ipv6/route.c | 20 ++++++++++++-------- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f604b311cc3e..cf6137e81408 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1279,10 +1279,13 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, while (fn) { if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = fn->leaf; struct rt6key *key; - key = (struct rt6key *) ((u8 *) fn->leaf + - args->offset); + if (!leaf) + goto backtrack; + + key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES @@ -1299,9 +1302,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, return fn; } } -#ifdef CONFIG_IPV6_SUBTREES backtrack: -#endif if (fn->fn_flags & RTN_ROOT) break; @@ -1358,7 +1359,18 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = fn->leaf; + struct rt6key *key; + + /* This node is being deleted */ + if (!leaf) { + if (plen <= fn->fn_bit) + goto out; + else + goto next; + } + + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -1372,6 +1384,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, prev = fn; +next: /* * We have more bits to go */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aeb349aea429..05dc450af441 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -712,6 +712,7 @@ out: } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *leaf, struct rt6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) @@ -730,7 +731,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -748,17 +749,21 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, return match; } -static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, + int oif, int strict) { + struct rt6_info *leaf = fn->leaf; struct rt6_info *match, *rt0; - struct net *net; bool do_rr = false; + if (!leaf) + return net->ipv6.ip6_null_entry; + rt0 = fn->rr_ptr; if (!rt0) - fn->rr_ptr = rt0 = fn->leaf; + fn->rr_ptr = rt0 = leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); if (do_rr) { @@ -766,13 +771,12 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) - next = fn->leaf; + next = leaf; if (next != rt0) fn->rr_ptr = next; } - net = dev_net(rt0->dst.dev); return match ? match : net->ipv6.ip6_null_entry; } @@ -1623,7 +1627,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(fn, oif, strict); + rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(rt, fl6, oif, strict); if (rt == net->ipv6.ip6_null_entry) { -- cgit v1.2.3 From 17ecf590b3cba19d9ecb410e340aa78128382abb Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:09 -0700 Subject: ipv6: add key length check into rt6_select() After rwlock is replaced with rcu and spinlock, fib6_lookup() could potentially return an intermediate node if other thread is doing fib6_del() on a route which is the only route on the node so that fib6_repair_tree() will be called on this node and potentially assigns fn->leaf to the its child's fn->leaf. In order to detect this situation in rt6_select(), we have to check if fn->fn_bit is consistent with the key length stored in the route. And depending on if the fn is in the subtree or not, the key is either rt->rt6i_dst or rt->rt6i_src. If any inconsistency is found, that means the node no longer holds valid routes in it. So net->ipv6.ip6_null_entry is returned. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 05dc450af441..24b80f43bbfb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -755,6 +755,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, struct rt6_info *leaf = fn->leaf; struct rt6_info *match, *rt0; bool do_rr = false; + int key_plen; if (!leaf) return net->ipv6.ip6_null_entry; @@ -763,6 +764,19 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, if (!rt0) fn->rr_ptr = rt0 = leaf; + /* Double check to make sure fn is not an intermediate node + * and fn->leaf does not points to its child's leaf + * (This might happen if all routes under fn are deleted from + * the tree and fib6_repair_tree() is called on the node.) + */ + key_plen = rt0->rt6i_dst.plen; +#ifdef CONFIG_IPV6_SUBTREES + if (rt0->rt6i_src.plen) + key_plen = rt0->rt6i_src.plen; +#endif + if (fn->fn_bit != key_plen) + return net->ipv6.ip6_null_entry; + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); -- cgit v1.2.3 From 66f5d6ce53e665477d2a33e8f539d4fa4ca81c83 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:10 -0700 Subject: ipv6: replace rwlock with rcu and spinlock in fib6_table With all the preparation work before, we are now ready to replace rwlock with rcu and spinlock in fib6_table. That means now all fib6_node in fib6_table are protected by rcu. And when freeing fib6_node, call_rcu() is used to wait for the rcu grace period before releasing the memory. When accessing fib6_node, corresponding rcu APIs need to be used. And all previous sessions protected by the write lock will now be protected by the spin lock per table. All previous sessions protected by read lock will now be protected by rcu_read_lock(). A couple of things to note here: 1. As part of the work of replacing rwlock with rcu, the linked list of fn->leaf now has to be rcu protected as well. So both fn->leaf and rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are used when manipulating them. 2. For fn->rr_ptr, first of all, it also needs to be rcu protected now and is tagged with __rcu and rcu APIs are used in corresponding places. Secondly, fn->rr_ptr is changed in rt6_select() which is a reader thread. This makes the issue a bit complicated. We think a valid solution for it is to let rt6_select() grab the tb6_lock if it decides to change it. As it is not in the normal operation and only happens when there is no valid neighbor cache for the route, we think the performance impact should be low. 3. fib6_walk_continue() has to be called with tb6_lock held even in the route dumping related functions, e.g. inet6_dump_fib(), fib6_tables_dump() and ipv6_route_seq_ops. It is because fib6_walk_continue() makes modifications to the walker structure, and so are fib6_repair_tree() and fib6_del_route(). In order to do proper syncing between them, we need to let fib6_walk_continue() hold the lock. We may be able to do further improvement on the way we do the tree walk to get rid of the need for holding the spin lock. But not for now. 4. When fib6_del_route() removes a route from the tree, we no longer mark rt->dst.rt6_next to NULL to make simultaneous reader be able to further traverse the list with rcu. However, rt->dst.rt6_next is only valid within this same rcu period. No one should access it later. 5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be performed before we publish this route (either by linking it to fn->leaf or insert it in the list pointed by fn->leaf) just to be safe because as soon as we publish the route, some read thread will be able to access it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- include/net/ip6_fib.h | 24 ++- net/ipv6/addrconf.c | 11 +- net/ipv6/ip6_fib.c | 405 ++++++++++++++++++++++++++++++-------------------- net/ipv6/route.c | 121 ++++++++------- 5 files changed, 333 insertions(+), 230 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/dst.h b/include/net/dst.h index 06a6765da074..204c19e25456 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,7 @@ struct dst_entry { union { struct dst_entry *next; struct rtable __rcu *rt_next; - struct rt6_info *rt6_next; + struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6bf929b50951..0b438b9bcb10 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -68,18 +68,18 @@ struct fib6_config { }; struct fib6_node { - struct fib6_node *parent; - struct fib6_node *left; - struct fib6_node *right; + struct fib6_node __rcu *parent; + struct fib6_node __rcu *left; + struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES - struct fib6_node *subtree; + struct fib6_node __rcu *subtree; #endif - struct rt6_info *leaf; + struct rt6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info *rr_ptr; + struct rt6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -91,7 +91,7 @@ struct fib6_gc_args { #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else -#define FIB6_SUBTREE(fn) ((fn)->subtree) +#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif struct mx6_config { @@ -174,6 +174,14 @@ struct rt6_info { unused:7; }; +#define for_each_fib6_node_rt_rcu(fn) \ + for (rt = rcu_dereference((fn)->leaf); rt; \ + rt = rcu_dereference(rt->dst.rt6_next)) + +#define for_each_fib6_walker_rt(w) \ + for (rt = (w)->leaf; rt; \ + rt = rcu_dereference_protected(rt->dst.rt6_next, 1)) + static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; @@ -310,7 +318,7 @@ struct rt6_statistics { struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; - rwlock_t tb6_lock; + spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f86e931d555e..9854d93e45bb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2321,12 +2321,12 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) @@ -2338,7 +2338,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -5898,10 +5898,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) spin_lock(&ifa->lock); if (ifa->rt) { struct rt6_info *rt = ifa->rt; - struct fib6_table *table = rt->rt6i_table; int cpu; - read_lock(&table->tb6_lock); + rcu_read_lock(); addrconf_set_nopolicy(ifa->rt, val); if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { @@ -5911,7 +5910,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) addrconf_set_nopolicy(*rtp, val); } } - read_unlock(&table->tb6_lock); + rcu_read_unlock(); } spin_unlock(&ifa->lock); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cf6137e81408..3f95908b39c3 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,8 +54,12 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); -static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); @@ -107,12 +111,12 @@ void fib6_update_sernum(struct rt6_info *rt) struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } /* @@ -207,8 +211,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ - rwlock_init(&tb->tb6_lock); - + spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -227,7 +230,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; - table->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(table->tb6_root.leaf, + net->ipv6.ip6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -324,11 +328,8 @@ unsigned int fib6_tables_seq_read(struct net *net) struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - read_lock_bh(&tb->tb6_lock); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; - read_unlock_bh(&tb->tb6_lock); - } } rcu_read_unlock(); @@ -374,7 +375,7 @@ static int fib6_node_dump(struct fib6_walker *w) { struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); w->leaf = NULL; return 0; @@ -384,9 +385,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { w->root = &tb->tb6_root; - read_lock_bh(&tb->tb6_lock); + spin_lock_bh(&tb->tb6_lock); fib6_walk(net, w); - read_unlock_bh(&tb->tb6_lock); + spin_unlock_bh(&tb->tb6_lock); } /* Called with rcu_read_lock() */ @@ -423,7 +424,7 @@ static int fib6_dump_node(struct fib6_walker *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -482,9 +483,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->count = 0; w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = w->root->fn_sernum; @@ -499,9 +500,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; @@ -582,11 +583,12 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_node *root, - struct in6_addr *addr, int plen, - int offset, int allow_create, - int replace_required, - struct netlink_ext_ack *extack) +static struct fib6_node *fib6_add_1(struct fib6_table *table, + struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -601,7 +603,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn = root; do { - key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -627,8 +631,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { - rt6_release(fn->leaf); - fn->leaf = NULL; + RCU_INIT_POINTER(fn->leaf, NULL); + rt6_release(leaf); } return fn; @@ -641,7 +645,11 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right : fn->left; + fn = dir ? + rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)) : + rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { @@ -672,13 +680,12 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - - ln->parent = pn; + RCU_INIT_POINTER(ln->parent, pn); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); return ln; @@ -692,7 +699,8 @@ insert_above: * and the current */ - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. @@ -729,27 +737,28 @@ insert_above: in->fn_bit = bit; - in->parent = pn; + RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&in->leaf->rt6i_ref); + atomic_inc(&rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))->rt6i_ref); /* update parent pointer */ if (dir) - pn->right = in; + rcu_assign_pointer(pn->right, in); else - pn->left = in; + rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; - ln->parent = in; - fn->parent = in; + RCU_INIT_POINTER(ln->parent, in); + rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; + rcu_assign_pointer(in->right, ln); + rcu_assign_pointer(in->left, fn); } else { - in->left = ln; - in->right = fn; + rcu_assign_pointer(in->left, ln); + rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ @@ -766,19 +775,19 @@ insert_above: ln->fn_bit = plen; - ln->parent = pn; - - if (dir) - pn->right = ln; - else - pn->left = ln; + RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) - ln->right = fn; + RCU_INIT_POINTER(ln->right, fn); else - ln->left = fn; + RCU_INIT_POINTER(ln->left, fn); + + rcu_assign_pointer(fn->parent, ln); - fn->parent = ln; + if (dir) + rcu_assign_pointer(pn->right, ln); + else + rcu_assign_pointer(pn->left, ln); } return ln; } @@ -824,6 +833,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { + struct fib6_table *table = rt->rt6i_table; + if (atomic_read(&rt->rt6i_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -832,12 +843,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. */ while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_leaf; + if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { + new_leaf = fib6_find_prefix(net, table, fn); + atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); rt6_release(rt); } - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); } } } @@ -849,9 +865,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct rt6_info *iter = NULL; - struct rt6_info **ins; - struct rt6_info **fallback_ins = NULL; + struct rt6_info __rcu **ins; + struct rt6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -866,7 +884,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; - for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + for (iter = leaf; iter; + iter = rcu_dereference_protected(iter->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates */ @@ -928,7 +948,8 @@ next_iter: if (fallback_ins && !found) { /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); found++; } @@ -942,7 +963,7 @@ next_iter: struct rt6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ - sibling = fn->leaf; + sibling = leaf; while (sibling) { if (sibling->rt6i_metric == rt->rt6i_metric && rt6_qualify_for_ecmp(sibling)) { @@ -950,7 +971,8 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = sibling->dst.rt6_next; + sibling = rcu_dereference_protected(sibling->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings @@ -979,10 +1001,10 @@ add: if (err) return err; - rt->dst.rt6_next = iter; - *ins = rt; - rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(rt->dst.rt6_next, iter); atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt); if (!info->skip_notify) @@ -1008,10 +1030,10 @@ add: if (err) return err; - *ins = rt; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; - atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt); if (!info->skip_notify) @@ -1023,14 +1045,15 @@ add: nsiblings = iter->rt6i_nsiblings; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->dst.rt6_next; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; @@ -1038,14 +1061,15 @@ add: *ins = iter->dst.rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { ins = &iter->dst.rt6_next; } - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1079,7 +1103,8 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, smp_wmb(); while (fn) { fn->fn_sernum = sernum; - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } } @@ -1087,12 +1112,14 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, * Add routing information to the routing tree. * / * with source addr info in sub-trees + * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc, struct netlink_ext_ack *extack) { + struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1113,7 +1140,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + fn = fib6_add_1(table, root, + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { @@ -1128,7 +1156,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (!fn->subtree) { + if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* @@ -1146,13 +1174,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + rcu_assign_pointer(sfn->leaf, + info->nl_net->ipv6.ip6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1168,10 +1197,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } /* Now link new subtree to main tree */ - sfn->parent = fn; - fn->subtree = sfn; + rcu_assign_pointer(sfn->parent, fn); + rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1182,9 +1211,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } } - if (!fn->leaf) { - fn->leaf = rt; + if (!rcu_access_pointer(fn->leaf)) { atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); } fn = sn; } @@ -1203,19 +1232,23 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - if (pn != fn && pn->leaf == rt) { - pn->leaf = NULL; + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn != fn && pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); atomic_dec(&rt->rt6i_ref); } - if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(info->nl_net, pn); + if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 - if (!pn->leaf) { - WARN_ON(pn->leaf == NULL); - pn->leaf = info->nl_net->ipv6.ip6_null_entry; + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = info->nl_net->ipv6.ip6_null_entry; } #endif - atomic_inc(&pn->leaf->rt6i_ref); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); } #endif goto failure; @@ -1230,7 +1263,7 @@ failure: * fn->leaf. */ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(info->nl_net, fn); + fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ @@ -1268,7 +1301,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, dir = addr_bit_set(args->addr, fn->fn_bit); - next = dir ? fn->right : fn->left; + next = dir ? rcu_dereference(fn->right) : + rcu_dereference(fn->left); if (next) { fn = next; @@ -1278,8 +1312,10 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, } while (fn) { - if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = fn->leaf; + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1289,10 +1325,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) { + if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(fn->subtree, - args + 1); + sfn = fib6_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1306,12 +1341,14 @@ backtrack: if (fn->fn_flags & RTN_ROOT) break; - fn = fn->parent; + fn = rcu_dereference(fn->parent); } return NULL; } +/* called with rcu_read_lock() held + */ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1359,7 +1396,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1389,9 +1426,9 @@ next: * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) - fn = fn->right; + fn = rcu_dereference(fn->right); else - fn = fn->left; + fn = rcu_dereference(fn->left); } out: if (exact_match) @@ -1413,9 +1450,11 @@ struct fib6_node *fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + WARN_ON(saddr == NULL); - if (fn && fn->subtree) - fn = fib6_locate_1(fn->subtree, saddr, src_len, + if (fn && subtree) + fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src), exact_match); } @@ -1433,16 +1472,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn) { + struct fib6_node *child_left, *child_right; + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; while (fn) { - if (fn->left) - return fn->left->leaf; - if (fn->right) - return fn->right->leaf; + child_left = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + child_right = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + if (child_left) + return rcu_dereference_protected(child_left->leaf, + lockdep_is_held(&table->tb6_lock)); + if (child_right) + return rcu_dereference_protected(child_right->leaf, + lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } @@ -1452,31 +1501,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. + * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, - struct fib6_node *fn) + struct fib6_table *table, + struct fib6_node *fn) { int children; int nstate; - struct fib6_node *child, *pn; + struct fib6_node *child; struct fib6_walker *w; int iter = 0; for (;;) { + struct fib6_node *fn_r = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *fn_l = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_r = rcu_dereference_protected(pn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_l = rcu_dereference_protected(pn->left, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_fn_leaf; + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf); + WARN_ON(fn_leaf); children = 0; child = NULL; - if (fn->right) - child = fn->right, children |= 1; - if (fn->left) - child = fn->left, children |= 2; + if (fn_r) + child = fn_r, children |= 1; + if (fn_l) + child = fn_l, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1484,36 +1551,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net, || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(net, fn); + new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 - if (!fn->leaf) { - WARN_ON(!fn->leaf); - fn->leaf = net->ipv6.ip6_null_entry; + if (!new_fn_leaf) { + WARN_ON(!new_fn_leaf); + new_fn_leaf = net->ipv6.ip6_null_entry; } #endif - atomic_inc(&fn->leaf->rt6i_ref); - return fn->parent; + atomic_inc(&new_fn_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_fn_leaf); + return pn; } - pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); - FIB6_SUBTREE(pn) = NULL; + RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) - pn->right = child; - else if (pn->left == fn) - pn->left = child; + if (pn_r == fn) + rcu_assign_pointer(pn->right, child); + else if (pn_l == fn) + rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) - child->parent = pn; + rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } @@ -1546,17 +1613,18 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; - rt6_release(pn->leaf); - pn->leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + rt6_release(pn_leaf); fn = pn; } } -static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nl_info *info) +static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, + struct rt6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = *rtp; + struct rt6_info *rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); @@ -1573,7 +1641,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_flush_exceptions(rt); /* Reset round-robin state, if necessary */ - if (fn->rr_ptr == rt) + if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ @@ -1592,20 +1660,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->dst.rt6_next; + w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); - rt->dst.rt6_next = NULL; - /* If it was last route, expunge its radix tree node */ - if (!fn->leaf) { + if (!rcu_access_pointer(fn->leaf)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; - fn = fib6_repair_tree(net, fn); + fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); @@ -1616,12 +1683,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_release(rt); } +/* Need to own table->tb6_lock */ int fib6_del(struct rt6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info **rtp; + struct rt6_info __rcu **rtp; + struct rt6_info __rcu **rtp_next; #if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { @@ -1642,11 +1712,14 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { - if (*rtp == rt) { - fib6_del_route(fn, rtp, info); + for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { + struct rt6_info *cur = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); + if (rt == cur) { + fib6_del_route(table, fn, rtp, info); return 0; } + rtp_next = &cur->dst.rt6_next; } return -ENOENT; } @@ -1673,11 +1746,13 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. + * + * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { - struct fib6_node *fn, *pn; + struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); @@ -1697,20 +1772,22 @@ static int fib6_walk_continue(struct fib6_walker *w) w->state = FWS_L; #endif case FWS_L: - if (fn->left) { - w->node = fn->left; + left = rcu_dereference_protected(fn->left, 1); + if (left) { + w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; case FWS_R: - if (fn->right) { - w->node = fn->right; + right = rcu_dereference_protected(fn->right, 1); + if (right) { + w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; - w->leaf = fn->leaf; + w->leaf = rcu_dereference_protected(fn->leaf, 1); case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1732,7 +1809,9 @@ skip: case FWS_U: if (fn == w->root) return 0; - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, 1); + left = rcu_dereference_protected(pn->left, 1); + right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { @@ -1741,13 +1820,13 @@ skip: continue; } #endif - if (pn->left == fn) { + if (left == fn) { w->state = FWS_R; continue; } - if (pn->right == fn) { + if (right == fn) { w->state = FWS_C; - w->leaf = w->node->leaf; + w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 @@ -1790,7 +1869,7 @@ static int fib6_clean_node(struct fib6_walker *w) return 0; } - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; @@ -1850,10 +1929,10 @@ static void __fib6_clean_all(struct net *net, for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); @@ -1967,7 +2046,8 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -1978,7 +2058,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); @@ -2108,7 +2189,9 @@ static int ipv6_route_yield(struct fib6_walker *w) return 1; do { - iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->w.leaf = rcu_dereference_protected( + iter->w.leaf->dst.rt6_next, + lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; @@ -2173,7 +2256,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = ((struct rt6_info *)v)->dst.rt6_next; + n = rcu_dereference(((struct rt6_info *)v)->dst.rt6_next); if (n) { ++*pos; return n; @@ -2181,9 +2264,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter_table: ipv6_route_check_sernum(iter); - read_lock(&iter->tbl->tb6_lock); + spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); - read_unlock(&iter->tbl->tb6_lock); + spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { if (v) ++*pos; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 24b80f43bbfb..cf44d0994b1e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -488,7 +488,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, } /* - * Route lookup. Any table->tb6_lock is implied. + * Route lookup. rcu_read_lock() should be held. */ static inline struct rt6_info *rt6_device_match(struct net *net, @@ -503,7 +503,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -722,7 +722,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -731,7 +731,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; + rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -743,7 +744,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rt->dst.rt6_next) + for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -752,7 +753,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6_info *match, *rt0; bool do_rr = false; int key_plen; @@ -760,9 +761,9 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, if (!leaf) return net->ipv6.ip6_null_entry; - rt0 = fn->rr_ptr; + rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) - fn->rr_ptr = rt0 = leaf; + rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf @@ -781,14 +782,19 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rt0->dst.rt6_next; + struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) next = leaf; - if (next != rt0) - fn->rr_ptr = next; + if (next != rt0) { + spin_lock_bh(&leaf->rt6i_table->tb6_lock); + /* make sure next is not being deleted from the tree */ + if (next->rt6i_node) + rcu_assign_pointer(fn->rr_ptr, next); + spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + } } return match ? match : net->ipv6.ip6_null_entry; @@ -878,13 +884,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { - struct fib6_node *pn; + struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; - pn = fn->parent; - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + pn = rcu_dereference(fn->parent); + sn = FIB6_SUBTREE(pn); + if (sn && sn != fn) + fn = fib6_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -916,13 +923,19 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = fn->leaf; - rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + rt = rcu_dereference(fn->leaf); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + } else { + rt = rt6_device_match(net, rt, &fl6->saddr, + fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, + fl6->flowi6_oif, flags); + } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -936,7 +949,7 @@ restart: if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -990,9 +1003,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, struct fib6_table *table; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, mxc, extack); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); return err; } @@ -1090,7 +1103,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) return pcpu_rt; } -/* It should be called with read_lock_bh(&tb6_lock) acquired */ +/* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1632,7 +1645,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1662,7 +1675,7 @@ redo_rt6_select: rt = rt_cache; if (rt == net->ipv6.ip6_null_entry) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -1671,7 +1684,7 @@ redo_rt6_select: dst_use_noref(&rt->dst, jiffies); rt6_dst_from_metrics_check(rt); } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1687,11 +1700,11 @@ redo_rt6_select: if (ip6_hold_safe(net, &rt, true)) { dst_use_noref(&rt->dst, jiffies); } else { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = rt; goto uncached_rt_out; } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -1719,7 +1732,7 @@ uncached_rt_out: pcpu_rt = rt6_get_pcpu_route(rt); if (pcpu_rt) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); } else { /* atomic_inc_not_zero() is needed when using rcu */ if (atomic_inc_not_zero(&rt->rt6i_ref)) { @@ -1730,12 +1743,12 @@ uncached_rt_out: * No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = rt6_make_pcpu_route(rt); rt6_release(rt); } else { /* rt is already removed from tree */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = net->ipv6.ip6_null_entry; dst_hold(&pcpu_rt->dst); } @@ -2131,10 +2144,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, * routes. */ - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -2179,7 +2192,7 @@ restart: out: ip6_hold_safe(net, &rt, true); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -2778,9 +2791,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) } table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out: ip6_rt_put(rt); @@ -2806,7 +2819,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.ip6_null_entry) goto out_put; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { struct rt6_info *sibling, *next_sibling; @@ -2836,7 +2849,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) err = fib6_del(rt, info); out_unlock: - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out_put: ip6_rt_put(rt); @@ -2861,7 +2874,7 @@ static int ip6_route_del(struct fib6_config *cfg, return err; } - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, @@ -2869,7 +2882,7 @@ static int ip6_route_del(struct fib6_config *cfg, !(cfg->fc_flags & RTF_CACHE)); if (fn) { - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (cfg->fc_flags & RTF_CACHE) { rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, &cfg->fc_src); @@ -2890,7 +2903,7 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!dst_hold_safe(&rt->dst)) break; - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) @@ -2899,7 +2912,7 @@ static int ip6_route_del(struct fib6_config *cfg, return __ip6_del_rt_siblings(rt, cfg); } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return err; } @@ -3074,12 +3087,12 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) @@ -3090,7 +3103,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3136,8 +3149,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) @@ -3145,7 +3158,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev } if (rt) ip6_hold_safe(NULL, &rt, false); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3183,20 +3196,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table) struct rt6_info *rt; restart: - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { if (dst_hold_safe(&rt->dst)) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); ip6_del_rt(rt); } else { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); } goto restart; } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } -- cgit v1.2.3 From 81eb8447daae3b62247aa66bb17b82f8fef68249 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:11 -0700 Subject: ipv6: take care of rt6_stats Currently, most of the rt6_stats are not hooked up correctly. As the last part of this patch series, hook up all existing rt6_stats and add one new stat fib_rt_uncache to indicate the number of routes in the uncached list. For details of the stats, please refer to the comments added in include/net/ip6_fib.h. Note: fib_rt_alloc and fib_rt_uncache are not guaranteed to be modified under a lock. So atomic_t is used for them. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 15 +++++++++------ net/ipv6/ip6_fib.c | 42 ++++++++++++++++++++++++------------------ net/ipv6/route.c | 16 ++++++++++++++-- 3 files changed, 47 insertions(+), 26 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0b438b9bcb10..10c913816032 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -297,12 +297,15 @@ struct fib6_walker { }; struct rt6_statistics { - __u32 fib_nodes; - __u32 fib_route_nodes; - __u32 fib_rt_alloc; /* permanent routes */ - __u32 fib_rt_entries; /* rt entries in table */ - __u32 fib_rt_cache; /* cache routes */ - __u32 fib_discarded_routes; + __u32 fib_nodes; /* all fib6 nodes */ + __u32 fib_route_nodes; /* intermediate nodes */ + __u32 fib_rt_entries; /* rt entries in fib table */ + __u32 fib_rt_cache; /* cached rt entries in exception table */ + __u32 fib_discarded_routes; /* total number of routes delete */ + + /* The following stats are not protected by any lock */ + atomic_t fib_rt_alloc; /* total number of routes alloced */ + atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3f95908b39c3..52a29ba32928 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -149,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + if (fn) + net->ipv6.rt6_stats->fib_nodes++; return fn; } -static void node_free_immediate(struct fib6_node *fn) +static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); + net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) @@ -170,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head) kmem_cache_free(fib6_node_kmem, fn); } -static void node_free(struct fib6_node *fn) +static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); + net->ipv6.rt6_stats->fib_nodes--; } void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) @@ -583,7 +587,8 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_table *table, +static struct fib6_node *fib6_add_1(struct net *net, + struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, @@ -675,7 +680,7 @@ static struct fib6_node *fib6_add_1(struct fib6_table *table, * Create new leaf node without children. */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -716,14 +721,14 @@ insert_above: * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { - in = node_alloc(); - ln = node_alloc(); + in = node_alloc(net); + ln = node_alloc(net); if (!in || !ln) { if (in) - node_free_immediate(in); + node_free_immediate(net, in); if (ln) - node_free_immediate(ln); + node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } @@ -768,7 +773,7 @@ insert_above: * (old node)[fn] NULL */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -1065,6 +1070,7 @@ add: fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->dst.rt6_next; } @@ -1140,7 +1146,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(table, root, + fn = fib6_add_1(info->nl_net, table, root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); @@ -1170,7 +1176,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, */ /* Create subtree root node */ - sfn = node_alloc(); + sfn = node_alloc(info->nl_net); if (!sfn) goto failure; @@ -1181,8 +1187,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, sfn, + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1191,7 +1197,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. */ - node_free_immediate(sfn); + node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } @@ -1200,8 +1206,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1609,7 +1615,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } read_unlock(&net->ipv6.fib6_walker_lock); - node_free(fn); + node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cf44d0994b1e..399d1bceec4a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -143,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->rt6i_uncached)) { struct uncached_list *ul = rt->rt6i_uncached_list; + struct net *net = dev_net(rt->dst.dev); spin_lock_bh(&ul->lock); list_del(&rt->rt6i_uncached); + atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); spin_unlock_bh(&ul->lock); } } @@ -359,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) + if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); + } return rt; } @@ -1156,6 +1160,8 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { + struct net *net = dev_net(rt6_ex->rt6i->dst.dev); + if (!bucket || !rt6_ex) return; rt6_ex->rt6i->rt6i_node = NULL; @@ -1164,6 +1170,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; + net->ipv6.rt6_stats->fib_rt_cache--; } /* Remove oldest rt6_ex in bucket and free the memory @@ -1270,6 +1277,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { + struct net *net = dev_net(ort->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1339,6 +1347,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, nrt->rt6i_node = ort->rt6i_node; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; + net->ipv6.rt6_stats->fib_rt_cache++; if (bucket->depth > FIB6_MAX_DEPTH) rt6_exception_remove_oldest(bucket); @@ -1714,6 +1723,7 @@ redo_rt6_select: * No need for another dst_hold() */ rt6_uncached_list_add(uncached_rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); } else { uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); @@ -1894,6 +1904,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori DST_OBSOLETE_NONE, 0); if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; @@ -2341,6 +2352,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, * do proper release of the net_device */ rt6_uncached_list_add(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -4422,7 +4434,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, - net->ipv6.rt6_stats->fib_rt_alloc, + atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), -- cgit v1.2.3 From 951f788a80ff8b6339c5c1ab888b0d4b4352efd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Oct 2017 21:07:18 -0700 Subject: ipv6: fix a BUG in rt6_get_pcpu_route() Ido reported following splat and provided a patch. [ 122.221814] BUG: using smp_processor_id() in preemptible [00000000] code: sshd/2672 [ 122.221845] caller is debug_smp_processor_id+0x17/0x20 [ 122.221866] CPU: 0 PID: 2672 Comm: sshd Not tainted 4.14.0-rc3-idosch-next-custom #639 [ 122.221880] Hardware name: Mellanox Technologies Ltd. MSN2100-CB2FO/SA001017, BIOS 5.6.5 06/07/2016 [ 122.221893] Call Trace: [ 122.221919] dump_stack+0xb1/0x10c [ 122.221946] ? _atomic_dec_and_lock+0x124/0x124 [ 122.221974] ? ___ratelimit+0xfe/0x240 [ 122.222020] check_preemption_disabled+0x173/0x1b0 [ 122.222060] debug_smp_processor_id+0x17/0x20 [ 122.222083] ip6_pol_route+0x1482/0x24a0 ... I believe we can simplify this code path a bit, since we no longer hold a read_lock and need to release it to avoid a dead lock. By disabling BH, we make sure we'll prevent code re-entry and rt6_get_pcpu_route()/rt6_make_pcpu_route() run on the same cpu. Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Ido Schimmel Signed-off-by: Eric Dumazet Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/route.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 399d1bceec4a..606e80325b21 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1136,15 +1136,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) dst_hold(&pcpu_rt->dst); p = this_cpu_ptr(rt->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - /* release refcnt taken by ip6_rt_pcpu_alloc() */ - dst_release_immediate(&pcpu_rt->dst); - /* release refcnt taken by above dst_hold() */ - dst_release_immediate(&pcpu_rt->dst); - dst_hold(&prev->dst); - pcpu_rt = prev; - } + BUG_ON(prev); rt6_dst_from_metrics_check(pcpu_rt); return pcpu_rt; @@ -1739,31 +1731,25 @@ uncached_rt_out: struct rt6_info *pcpu_rt; dst_use_noref(&rt->dst, jiffies); + local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(rt); - if (pcpu_rt) { - rcu_read_unlock(); - } else { + if (!pcpu_rt) { /* atomic_inc_not_zero() is needed when using rcu */ if (atomic_inc_not_zero(&rt->rt6i_ref)) { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. - * - * No dst_hold() on rt is needed because grabbing + /* No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - rcu_read_unlock(); pcpu_rt = rt6_make_pcpu_route(rt); rt6_release(rt); } else { /* rt is already removed from tree */ - rcu_read_unlock(); pcpu_rt = net->ipv6.ip6_null_entry; dst_hold(&pcpu_rt->dst); } } - + local_bh_enable(); + rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; } -- cgit v1.2.3 From 8ef802aa8ec87a0fec20d28f8f51cad964265902 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:23 -0700 Subject: ipv6: prepare RCU lookups for idev->addr_list inet6_ifa_finish_destroy() already uses kfree_rcu() to free inet6_ifaddr structs. We need to use proper list additions/deletions in order to allow readers to use RCU instead of idev->lock rwlock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9854d93e45bb..d1ff0955b709 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -945,7 +945,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) break; } - list_add_tail(&ifp->if_list, p); + list_add_tail_rcu(&ifp->if_list, p); } static u32 inet6_addr_hash(const struct in6_addr *addr) @@ -1204,7 +1204,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) action = check_cleanup_prefix_route(ifp, &expires); - list_del_init(&ifp->if_list); + list_del_rcu(&ifp->if_list); __in6_ifa_put(ifp); write_unlock_bh(&ifp->idev->lock); @@ -3562,7 +3562,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; - struct list_head del_list; int _keep_addr; bool keep_addr; int state, i; @@ -3654,7 +3653,6 @@ restart: */ keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); - INIT_LIST_HEAD(&del_list); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { struct rt6_info *rt = NULL; bool keep; @@ -3663,8 +3661,6 @@ restart: keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) && !addr_is_local(&ifa->addr); - if (!keep) - list_move(&ifa->if_list, &del_list); write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); @@ -3698,19 +3694,14 @@ restart: } write_lock_bh(&idev->lock); + if (!keep) { + list_del_rcu(&ifa->if_list); + in6_ifa_put(ifa); + } } write_unlock_bh(&idev->lock); - /* now clean up addresses to be removed */ - while (!list_empty(&del_list)) { - ifa = list_first_entry(&del_list, - struct inet6_ifaddr, if_list); - list_del(&ifa->if_list); - - in6_ifa_put(ifa); - } - /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); -- cgit v1.2.3 From d9bf82c2f61be534873a37a0631b58761b6d6506 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:24 -0700 Subject: ipv6: ipv6_count_addresses() rcu conversion Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d1ff0955b709..2e029c8be1f2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -152,7 +152,7 @@ static void ipv6_regen_rndid(struct inet6_dev *idev); static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); -static int ipv6_count_addresses(struct inet6_dev *idev); +static int ipv6_count_addresses(const struct inet6_dev *idev); static int ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); @@ -1785,15 +1785,15 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, return err; } -static int ipv6_count_addresses(struct inet6_dev *idev) +static int ipv6_count_addresses(const struct inet6_dev *idev) { + const struct inet6_ifaddr *ifp; int cnt = 0; - struct inet6_ifaddr *ifp; - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) + rcu_read_lock(); + list_for_each_entry_rcu(ifp, &idev->addr_list, if_list) cnt++; - read_unlock_bh(&idev->lock); + rcu_read_unlock(); return cnt; } -- cgit v1.2.3 From 47e26941f78d6ec03c5dd547425409f1f7852ff5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:25 -0700 Subject: ipv6: ipv6_chk_custom_prefix() rcu conversion Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2e029c8be1f2..33ee84c2512b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1859,20 +1859,18 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; bool ret = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len); if (ret) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); -- cgit v1.2.3 From 24ba333b2c1a609fc17214ceb215e19bd9d54951 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:26 -0700 Subject: ipv6: ipv6_chk_prefix() rcu conversion Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 33ee84c2512b..ea63442209bf 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1880,22 +1880,20 @@ EXPORT_SYMBOL(ipv6_chk_custom_prefix); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; int onlink; onlink = 0; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { onlink = ipv6_prefix_equal(addr, &ifa->addr, ifa->prefix_len); if (onlink) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); return onlink; -- cgit v1.2.3 From f59c031e9134bb16c38980196a73d7ba40979baa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:27 -0700 Subject: ipv6: __ipv6_dev_get_saddr() rcu conversion Callers hold rcu_read_lock(), so we do not need the rcu_read_lock()/rcu_read_unlock() pair. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ea63442209bf..20c3ca777529 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1558,8 +1558,7 @@ static int __ipv6_dev_get_saddr(struct net *net, { struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx]; - read_lock_bh(&idev->lock); - list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) { int i; /* @@ -1625,7 +1624,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } } out: - read_unlock_bh(&idev->lock); return hiscore_idx; } -- cgit v1.2.3 From cc429c8f6fa7cc1c9109bb4a1ad63a6d1f232f0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Oct 2017 19:30:28 -0700 Subject: ipv6: avoid cache line dirtying in ipv6_dev_get_saddr() By extending the rcu section a bit, we can avoid these very expensive in6_ifa_put()/in6_ifa_hold() calls done in __ipv6_dev_get_saddr() and ipv6_dev_get_saddr() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 20c3ca777529..cab3faad2bf1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1608,11 +1608,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } break; } else if (minihiscore < miniscore) { - if (hiscore->ifa) - in6_ifa_put(hiscore->ifa); - - in6_ifa_hold(score->ifa); - swap(hiscore, score); hiscore_idx = 1 - hiscore_idx; @@ -1660,6 +1655,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, int dst_type; bool use_oif_addr = false; int hiscore_idx = 0; + int ret = 0; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1735,15 +1731,14 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, } out: - rcu_read_unlock(); - hiscore = &scores[hiscore_idx]; if (!hiscore->ifa) - return -EADDRNOTAVAIL; + ret = -EADDRNOTAVAIL; + else + *saddr = hiscore->ifa->addr; - *saddr = hiscore->ifa->addr; - in6_ifa_put(hiscore->ifa); - return 0; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ipv6_dev_get_saddr); -- cgit v1.2.3 From bfd8e5a407133e58a92a38ccf3d0ba6db81f22d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Oct 2017 06:01:37 -0700 Subject: ipv6: avoid zeroing per cpu data again per cpu allocations are already zeroed, no need to clear them again. Fixes: d52d3997f843f ("ipv6: Create percpu rt6_info") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Cc: Tejun Heo Acked-by: Tejun Heo Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 606e80325b21..3d7d4e09301e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -377,17 +377,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, if (rt) { rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (rt->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **p; - - p = per_cpu_ptr(rt->rt6i_pcpu, cpu); - /* no one shares rt */ - *p = NULL; - } - } else { + if (!rt->rt6i_pcpu) { dst_release_immediate(&rt->dst); return NULL; } -- cgit v1.2.3 From 66c77ff3a049a1d182c9c78cc6286bb1ceee70db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Oct 2017 09:52:24 -0700 Subject: ipv6: addrlabel: rework ip6addrlbl_get() If we allocate skb before the lookup, we can use RCU without the need of ip6addrlbl_hold() This means that the following patch can get rid of refcounting. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index c6311d7108f6..f31489652c4a 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -546,38 +546,28 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; addr = nla_data(tb[IFAL_ADDRESS]); - rcu_read_lock(); - p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - if (p && !ip6addrlbl_hold(p)) - p = NULL; - lseq = net->ipv6.ip6addrlbl_table.seq; - rcu_read_unlock(); - - if (!p) { - err = -ESRCH; - goto out; - } - skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); - if (!skb) { - ip6addrlbl_put(p); + if (!skb) return -ENOBUFS; - } - err = ip6addrlbl_fill(skb, p, lseq, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWADDRLABEL, 0); + err = -ESRCH; - ip6addrlbl_put(p); + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); + lseq = net->ipv6.ip6addrlbl_table.seq; + if (p) + err = ip6addrlbl_fill(skb, p, lseq, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, + RTM_NEWADDRLABEL, 0); + rcu_read_unlock(); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); - goto out; + } else { + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } - - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); -out: return err; } -- cgit v1.2.3 From 2809c0957dbbd56d3a92e9ede27b19943fef7236 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Oct 2017 09:52:25 -0700 Subject: ipv6: addrlabel: remove refcounting After previous patch ("ipv6: addrlabel: rework ip6addrlbl_get()") we can remove the refcount from struct ip6addrlbl_entry, since it is no longer elevated in p6addrlbl_get() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index f31489652c4a..2606d2fa3ac4 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -18,7 +18,6 @@ #include #include #include -#include #if 0 #define ADDRLABEL(x...) printk(x) @@ -36,7 +35,6 @@ struct ip6addrlbl_entry { int addrtype; u32 label; struct hlist_node list; - refcount_t refcnt; struct rcu_head rcu; }; @@ -111,28 +109,6 @@ static const __net_initconst struct ip6addrlbl_init_table } }; -/* Object management */ -static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) -{ - kfree(p); -} - -static void ip6addrlbl_free_rcu(struct rcu_head *h) -{ - ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); -} - -static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p) -{ - return refcount_inc_not_zero(&p->refcnt); -} - -static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) -{ - if (refcount_dec_and_test(&p->refcnt)) - call_rcu(&p->rcu, ip6addrlbl_free_rcu); -} - /* Find label */ static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, @@ -219,7 +195,6 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - refcount_set(&newp->refcnt, 1); return newp; } @@ -243,7 +218,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, goto out; } hlist_replace_rcu(&p->list, &newp->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); goto out; } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || (p->prefixlen < newp->prefixlen)) { @@ -281,7 +256,7 @@ static int ip6addrlbl_add(struct net *net, ret = __ip6addrlbl_add(net, newp, replace); spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) - ip6addrlbl_free(newp); + kfree(newp); return ret; } @@ -302,7 +277,7 @@ static int __ip6addrlbl_del(struct net *net, p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); ret = 0; break; } @@ -360,7 +335,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) spin_lock(&net->ipv6.ip6addrlbl_table.lock); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); } spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } -- cgit v1.2.3 From d0e60206bea2dec46c0a28fd6b116646aa67c5ae Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 9 Oct 2017 17:17:26 -0700 Subject: ipv6: use rcu_dereference_bh() in ipv6_route_seq_next() This patch replaces rcu_deference() with rcu_dereference_bh() in ipv6_route_seq_next() to avoid the following warning: [ 19.431685] WARNING: suspicious RCU usage [ 19.433451] 4.14.0-rc3-00914-g66f5d6c #118 Not tainted [ 19.435509] ----------------------------- [ 19.437267] net/ipv6/ip6_fib.c:2259 suspicious rcu_dereference_check() usage! [ 19.440790] [ 19.440790] other info that might help us debug this: [ 19.440790] [ 19.444734] [ 19.444734] rcu_scheduler_active = 2, debug_locks = 1 [ 19.447757] 2 locks held by odhcpd/3720: [ 19.449480] #0: (&p->lock){+.+.}, at: [] seq_read+0x3c/0x333 [ 19.452720] #1: (rcu_read_lock_bh){....}, at: [] ipv6_route_seq_start+0x5/0xfd [ 19.456323] [ 19.456323] stack backtrace: [ 19.458812] CPU: 0 PID: 3720 Comm: odhcpd Not tainted 4.14.0-rc3-00914-g66f5d6c #118 [ 19.462042] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 19.465414] Call Trace: [ 19.466788] dump_stack+0x86/0xc0 [ 19.468358] lockdep_rcu_suspicious+0xea/0xf3 [ 19.470183] ipv6_route_seq_next+0x71/0x164 [ 19.471963] seq_read+0x244/0x333 [ 19.473522] proc_reg_read+0x48/0x67 [ 19.475152] ? proc_reg_write+0x67/0x67 [ 19.476862] __vfs_read+0x26/0x10b [ 19.478463] ? __might_fault+0x37/0x84 [ 19.480148] vfs_read+0xba/0x146 [ 19.481690] SyS_read+0x51/0x8e [ 19.483197] do_int80_syscall_32+0x66/0x15a [ 19.484969] entry_INT80_compat+0x32/0x50 [ 19.486707] RIP: 0023:0xf7f0be8e [ 19.488244] RSP: 002b:00000000ffa75d04 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ 19.491431] RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 0000000008056068 [ 19.493886] RDX: 0000000000001000 RSI: 0000000008056008 RDI: 0000000000001000 [ 19.496331] RBP: 00000000000001ff R08: 0000000000000000 R09: 0000000000000000 [ 19.498768] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 19.501217] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Xiaolong Ye Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 52a29ba32928..c2ecd5ec638a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2262,7 +2262,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference(((struct rt6_info *)v)->dst.rt6_next); + n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); if (n) { ++*pos; return n; -- cgit v1.2.3 From b2427e671766c51379bc882fdf809d905c6f9619 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Oct 2017 18:01:16 +0100 Subject: ipv6: fix dereference of rt6_ex before null check error Currently rt6_ex is being dereferenced before it is null checked hence there is a possible null dereference bug. Fix this by only dereferencing rt6_ex after it has been null checked. Detected by CoverityScan, CID#1457749 ("Dereference before null check") Fixes: 81eb8447daae ("ipv6: take care of rt6_stats") Signed-off-by: Colin Ian King Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 13b0d6d1cc30..c1c44f484286 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1142,10 +1142,12 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { - struct net *net = dev_net(rt6_ex->rt6i->dst.dev); + struct net *net; if (!bucket || !rt6_ex) return; + + net = dev_net(rt6_ex->rt6i->dst.dev); rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); rt6_release(rt6_ex->rt6i); -- cgit v1.2.3 From 442d713baa33db0f78adadee6125c215f10f5a75 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Oct 2017 19:10:30 +0100 Subject: ipv6: fix incorrect bitwise operator used on rt6i_flags The use of the | operator always leads to true which looks rather suspect to me. Fix this by using & instead to just check the RTF_CACHE entry bit. Detected by CoverityScan, CID#1457734, #1457747 ("Wrong operator used") Fixes: 35732d01fe31 ("ipv6: introduce a hash table to store dst cache") Signed-off-by: Colin Ian King Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c1c44f484286..2e8842fa6450 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1415,7 +1415,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt) int err; if (!from || - !(rt->rt6i_flags | RTF_CACHE)) + !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; if (!rcu_access_pointer(from->rt6i_exception_bucket)) @@ -1459,7 +1459,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) struct rt6_exception *rt6_ex; if (!from || - !(rt->rt6i_flags | RTF_CACHE)) + !(rt->rt6i_flags & RTF_CACHE)) return; rcu_read_lock(); -- cgit v1.2.3 From 4ea2607f7871ca433ab0c5300289215974213f26 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:28:00 +0200 Subject: ipv6: addrconf: don't use rtnl mutex in RTM_GETNETCONF Instead of relying on rtnl mutex bump device reference count. After this change, values reported can change in parallel, but thats not much different from current state, as anyone can change the settings right after rtnl_unlock (and before userspace processed reply). While at it, switch to GFP_KERNEL allocation. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d9f6226694eb..5207f567ef28 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -616,23 +616,23 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; + struct inet6_dev *in6_dev = NULL; + struct net_device *dev = NULL; struct netconfmsg *ncm; struct sk_buff *skb; struct ipv6_devconf *devconf; - struct inet6_dev *in6_dev; - struct net_device *dev; int ifindex; int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, devconf_ipv6_policy, extack); if (err < 0) - goto errout; + return err; - err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) - goto errout; + return -EINVAL; + err = -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: @@ -642,10 +642,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, devconf = net->ipv6.devconf_dflt; break; default: - dev = __dev_get_by_index(net, ifindex); + dev = dev_get_by_index(net, ifindex); if (!dev) - goto errout; - in6_dev = __in6_dev_get(dev); + return -EINVAL; + in6_dev = in6_dev_get(dev); if (!in6_dev) goto errout; devconf = &in6_dev->cnf; @@ -653,7 +653,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; @@ -669,6 +669,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: + if (in6_dev) + in6_dev_put(in6_dev); + if (dev) + dev_put(dev); return err; } @@ -6570,7 +6574,7 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr, 0); __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, 0); + inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED); ipv6_addr_label_rtnl_register(); -- cgit v1.2.3 From c24675f871d3bca569532b3744962dd49938ddff Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:28:01 +0200 Subject: ipv6: addrconf: don't use rtnl mutex in RTM_GETADDR Similar to the previous patch, use the device lookup functions that bump device refcount and flag this as DOIT_UNLOCKED to avoid rtnl mutex. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5207f567ef28..4603aa488f4f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4890,17 +4890,15 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); if (err < 0) - goto errout; + return err; addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (!addr) { - err = -EINVAL; - goto errout; - } + if (!addr) + return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = __dev_get_by_index(net, ifm->ifa_index); + dev = dev_get_by_index(net, ifm->ifa_index); ifa = ipv6_get_ifaddr(net, addr, dev, 1); if (!ifa) { @@ -4926,6 +4924,8 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, errout_ifa: in6_ifa_put(ifa); errout: + if (dev) + dev_put(dev); return err; } @@ -6568,7 +6568,7 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, 0); + inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED); __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr, 0); __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, -- cgit v1.2.3 From 0e80193bd8c1d97654f1a2f45934e7372e9a512e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 13 Oct 2017 15:01:08 -0700 Subject: ipv6: check fn before doing FIB6_SUBTREE(fn) In fib6_locate(), we need to first make sure fn is not NULL before doing FIB6_SUBTREE(fn) to avoid crash. This fixes the following static checker warning: net/ipv6/ip6_fib.c:1462 fib6_locate() warn: variable dereferenced before check 'fn' (see line 1459) net/ipv6/ip6_fib.c 1458 if (src_len) { 1459 struct fib6_node *subtree = FIB6_SUBTREE(fn); ^^^^^^^^^^^^^^^^ We shifted this dereference 1460 1461 WARN_ON(saddr == NULL); 1462 if (fn && subtree) ^^ before the check for NULL. 1463 fn = fib6_locate_1(subtree, saddr, src_len, 1464 offsetof(struct rt6_info, rt6i_src) Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Dan Carpenter Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c2ecd5ec638a..548af48212fc 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1456,13 +1456,16 @@ struct fib6_node *fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { - struct fib6_node *subtree = FIB6_SUBTREE(fn); - WARN_ON(saddr == NULL); - if (fn && subtree) - fn = fib6_locate_1(subtree, saddr, src_len, + if (fn) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree) { + fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src), exact_match); + } + } } #endif -- cgit v1.2.3 From 78802011fbe34331bdef6f2dfb1634011f0e4c32 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:20 -0700 Subject: inet: frags: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Alexander Aring Cc: Stefan Schmidt Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: linux-wpan@vger.kernel.org Cc: netdev@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Signed-off-by: Kees Cook Acked-by: Stefan Schmidt # for ieee802154 Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- net/ieee802154/6lowpan/reassembly.c | 5 +++-- net/ipv4/inet_fragment.c | 4 ++-- net/ipv4/ip_fragment.c | 5 +++-- net/ipv6/netfilter/nf_conntrack_reasm.c | 5 +++-- net/ipv6/reassembly.c | 5 +++-- 6 files changed, 15 insertions(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index fc59e0775e00..c695807ca707 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -95,7 +95,7 @@ struct inet_frags { void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); - void (*frag_expire)(unsigned long data); + void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; }; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index f85b08baff16..85bf86ad6b18 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -80,12 +80,13 @@ static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) fq->daddr = *arg->dst; } -static void lowpan_frag_expire(unsigned long data) +static void lowpan_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags); spin_lock(&fq->q.lock); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..7f3ef5c287a1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) spin_unlock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); + f->frag_expire(&fq->timer); return evicted; } @@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, f->constructor(q, arg); add_frag_mem_limit(nf, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 1); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 46408c220d9d..9215654a401f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -190,12 +190,13 @@ static bool frag_expire_skip_icmp(u32 user) /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ -static void ip_expire(unsigned long arg) +static void ip_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct ipq *qp; struct net *net; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); rcu_read_lock(); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b263bf3a19f7..977d8900cfd1 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -169,12 +169,13 @@ static unsigned int nf_hashfn(const struct inet_frag_queue *q) return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); } -static void nf_ct_frag6_expire(unsigned long data) +static void nf_ct_frag6_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); ip6_expire_frag_queue(net, fq, &nf_frags); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 846012eae526..afbc000ad4f2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -170,12 +170,13 @@ out: } EXPORT_SYMBOL(ip6_expire_frag_queue); -static void ip6_frag_expire(unsigned long data) +static void ip6_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); ip6_expire_frag_queue(net, fq, &ip6_frags); -- cgit v1.2.3 From 275757e6bae15a8621130907a78096afd1e15d2c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 16:36:52 -0500 Subject: ipv6: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in some cases I placed the "fall through" comment on its own line, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv6/ah6.c | 1 + net/ipv6/exthdrs.c | 1 + net/ipv6/icmp.c | 6 ++---- net/ipv6/ip6_fib.c | 4 ++++ net/ipv6/ip6_tunnel.c | 1 + net/ipv6/ip6mr.c | 1 + net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 3 ++- net/ipv6/raw.c | 4 ++++ net/ipv6/tcp_ipv6.c | 3 ++- net/ipv6/xfrm6_policy.c | 1 + 10 files changed, 19 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 7802b72196f3..37bb33fbc742 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -271,6 +271,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) case NEXTHDR_DEST: if (dir == XFRM_POLICY_OUT) ipv6_rearrange_destopt(iph, exthdr.opth); + /* fall through */ case NEXTHDR_HOP: if (!zero_out_mutable_opts(exthdr.opth)) { net_dbg_ratelimited("overrun %sopts\n", diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 95516138e861..7835dea930b4 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -89,6 +89,7 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; + /* fall through */ case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); return false; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4e52d52a6752..6ae5dd3f4d0d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -864,10 +864,8 @@ static int icmpv6_rcv(struct sk_buff *skb) goto discard_it; hdr = icmp6_hdr(skb); - /* - * Drop through to notify - */ - + /* to notify */ + /* fall through */ case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 548af48212fc..1ada9672d198 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1780,6 +1780,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } w->state = FWS_L; #endif + /* fall through */ case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { @@ -1788,6 +1789,7 @@ static int fib6_walk_continue(struct fib6_walker *w) continue; } w->state = FWS_R; + /* fall through */ case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { @@ -1797,6 +1799,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); + /* fall through */ case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1815,6 +1818,7 @@ static int fib6_walk_continue(struct fib6_walker *w) } skip: w->state = FWS_U; + /* fall through */ case FWS_U: if (fn == w->root) return 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 825548680b1e..4212879ff35e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -593,6 +593,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case NDISC_REDIRECT: rel_type = ICMP_REDIRECT; rel_code = ICMP_REDIR_HOST; + /* fall through */ default: return 0; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index f5500f5444e9..59fad81e5f7a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1722,6 +1722,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; + /* fall through */ case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 46d6dba50698..1d2fb9267d6f 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -290,7 +290,8 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e4462b0ff801..761a473a07c5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1055,6 +1055,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } @@ -1077,6 +1078,7 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); @@ -1138,6 +1140,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } @@ -1160,6 +1163,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 64d94afa427f..ae83615b7f6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1577,8 +1577,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 11d1314ab6c5..4ed9f8cc3b6a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -152,6 +152,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) switch (nexthdr) { case NEXTHDR_FRAGMENT: onlyproto = 1; + /* fall through */ case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: -- cgit v1.2.3 From f3d9832e56c48e4ca50bab0457e21bcaade4536d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:52 -0700 Subject: ipv6: addrconf: cleanup locking in ipv6_add_addr ipv6_add_addr is called in process context with rtnl lock held (e.g., manual config of an address) or during softirq processing (e.g., autoconf and address from a router advertisement). Currently, ipv6_add_addr calls rcu_read_lock_bh shortly after entry and does not call unlock until exit, minus the call around the address validator notifier. Similarly, addrconf_hash_lock is taken after the validator notifier and held until exit. This forces the allocation of inet6_ifaddr to always be atomic. Refactor ipv6_add_addr as follows: 1. add an input boolean to discriminate the call path (process context or softirq). This new flag controls whether the alloc can be done with GFP_KERNEL or GFP_ATOMIC. 2. Move the rcu_read_lock_bh and unlock calls only around functions that do rcu updates. 3. Remove the in6_dev_hold and put added by 3ad7d2468f79f ("Ipvlan should return an error when an address is already in use."). This was done presumably because rcu_read_unlock_bh needs to be called before calling the validator. Since rcu_read_lock is not needed before the validator runs revert the hold and put added by 3ad7d2468f79f and only do the hold when setting ifp->idev. 4. move duplicate address check and insertion of new address in the global address hash into a helper. The helper is called after an ifa is allocated and filled in. This allows the ifa for manually configured addresses to be done with GFP_KERNEL and reduces the overall amount of time with rcu_read_lock held and hash table spinlock held. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 104 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 44 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4603aa488f4f..a8d202b1b919 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -957,18 +957,43 @@ static u32 inet6_addr_hash(const struct in6_addr *addr) return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } +static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) +{ + unsigned int hash; + int err = 0; + + spin_lock(&addrconf_hash_lock); + + /* Ignore adding duplicate addresses on an interface */ + if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev)) { + ADBG("ipv6_add_addr: already assigned\n"); + err = -EEXIST; + goto out; + } + + /* Add to big hash table */ + hash = inet6_addr_hash(&ifa->addr); + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + +out: + spin_unlock(&addrconf_hash_lock); + + return err; +} + /* On success it returns ifp with increased reference count */ static struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, - int scope, u32 flags, u32 valid_lft, u32 prefered_lft) + int scope, u32 flags, u32 valid_lft, u32 prefered_lft, + bool can_block) { + gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct rt6_info *rt; + struct rt6_info *rt = NULL; struct in6_validator_info i6vi; - unsigned int hash; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -978,42 +1003,24 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, addr_type & IPV6_ADDR_LOOPBACK)) return ERR_PTR(-EADDRNOTAVAIL); - rcu_read_lock_bh(); - - in6_dev_hold(idev); - if (idev->dead) { err = -ENODEV; /*XXX*/ - goto out2; + goto out; } if (idev->cnf.disable_ipv6) { err = -EACCES; - goto out2; + goto out; } i6vi.i6vi_addr = *addr; i6vi.i6vi_dev = idev; - rcu_read_unlock_bh(); - err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); - - rcu_read_lock_bh(); err = notifier_to_errno(err); - if (err) - goto out2; - - spin_lock(&addrconf_hash_lock); - - /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { - ADBG("ipv6_add_addr: already assigned\n"); - err = -EEXIST; + if (err < 0) goto out; - } - - ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + ifa = kzalloc(sizeof(*ifa), gfp_flags); if (!ifa) { ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; @@ -1023,6 +1030,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, rt = addrconf_dst_alloc(idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); + rt = NULL; goto out; } @@ -1053,16 +1061,21 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->rt = rt; ifa->idev = idev; + in6_dev_hold(idev); + /* For caller */ refcount_set(&ifa->refcnt, 1); - /* Add to big hash table */ - hash = inet6_addr_hash(addr); + rcu_read_lock_bh(); - hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); - spin_unlock(&addrconf_hash_lock); + err = ipv6_add_addr_hash(idev->dev, ifa); + if (err < 0) { + rcu_read_unlock_bh(); + goto out; + } write_lock(&idev->lock); + /* Add to inet6_dev unicast addr list. */ ipv6_link_dev_addr(idev, ifa); @@ -1073,21 +1086,23 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, in6_ifa_hold(ifa); write_unlock(&idev->lock); -out2: + rcu_read_unlock_bh(); - if (likely(err == 0)) - inet6addr_notifier_call_chain(NETDEV_UP, ifa); - else { - kfree(ifa); - in6_dev_put(idev); + inet6addr_notifier_call_chain(NETDEV_UP, ifa); +out: + if (unlikely(err < 0)) { + if (rt) + ip6_rt_put(rt); + if (ifa) { + if (ifa->idev) + in6_dev_put(ifa->idev); + kfree(ifa); + } ifa = ERR_PTR(err); } return ifa; -out: - spin_unlock(&addrconf_hash_lock); - goto out2; } enum cleanup_prefix_rt_t { @@ -1334,7 +1349,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft); + tmp_valid_lft, tmp_prefered_lft, true); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2018,7 +2033,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, scope, flags, valid_lft, - preferred_lft); + preferred_lft, false); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2476,7 +2491,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags, valid_lft, - prefered_lft); + prefered_lft, false); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2845,7 +2860,7 @@ static int inet6_addr_add(struct net *net, int ifindex, } ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft); + valid_lft, prefered_lft, true); if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -2960,7 +2975,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifp = ipv6_add_addr(idev, addr, NULL, plen, scope, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, + true); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3060,7 +3076,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev, #endif ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); -- cgit v1.2.3 From ff7883ea60e7b021bcd6539b8211879554c8db9a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:53 -0700 Subject: net: ipv6: Make inet6addr_validator a blocking notifier inet6addr_validator chain was added by commit 3ad7d2468f79f ("Ipvlan should return an error when an address is already in use") to allow address validation before changes are committed and to be able to fail the address change with an error back to the user. The address validation is not done for addresses received from router advertisements. Handling RAs in softirq context is the only reason for the notifier chain to be atomic versus blocking. Since the only current user, ipvlan, of the validator chain ignores softirq context, the notifier can be made blocking and simply not invoked for softirq path. The blocking option is needed by spectrum for example to validate resources for an adding an address to an interface. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 4 ---- net/ipv6/addrconf.c | 21 ++++++++++++++------- net/ipv6/addrconf_core.c | 9 +++++---- 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'net/ipv6') diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 3cf67db513e2..6842739b6679 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -808,10 +808,6 @@ static int ipvlan_addr6_event(struct notifier_block *unused, struct net_device *dev = (struct net_device *)if6->idev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); - /* FIXME IPv6 autoconf calls us from bh without RTNL */ - if (in_softirq()) - return NOTIFY_DONE; - if (!netif_is_ipvlan(dev)) return NOTIFY_DONE; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a8d202b1b919..dd9c0c435f71 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -993,7 +993,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt = NULL; - struct in6_validator_info i6vi; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1013,12 +1012,20 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - i6vi.i6vi_addr = *addr; - i6vi.i6vi_dev = idev; - err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); - err = notifier_to_errno(err); - if (err < 0) - goto out; + /* validator notifier needs to be blocking; + * do not call in atomic context + */ + if (can_block) { + struct in6_validator_info i6vi = { + .i6vi_addr = *addr, + .i6vi_dev = idev, + }; + + err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); + err = notifier_to_errno(err); + if (err < 0) + goto out; + } ifa = kzalloc(sizeof(*ifa), gfp_flags); if (!ifa) { diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 9e3488d50b15..32b564dfd02a 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -88,7 +88,7 @@ int __ipv6_addr_type(const struct in6_addr *addr) EXPORT_SYMBOL(__ipv6_addr_type); static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); -static ATOMIC_NOTIFIER_HEAD(inet6addr_validator_chain); +static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain); int register_inet6addr_notifier(struct notifier_block *nb) { @@ -110,19 +110,20 @@ EXPORT_SYMBOL(inet6addr_notifier_call_chain); int register_inet6addr_validator_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_register(&inet6addr_validator_chain, nb); + return blocking_notifier_chain_register(&inet6addr_validator_chain, nb); } EXPORT_SYMBOL(register_inet6addr_validator_notifier); int unregister_inet6addr_validator_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&inet6addr_validator_chain, nb); + return blocking_notifier_chain_unregister(&inet6addr_validator_chain, + nb); } EXPORT_SYMBOL(unregister_inet6addr_validator_notifier); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v) { - return atomic_notifier_call_chain(&inet6addr_validator_chain, val, v); + return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v); } EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain); -- cgit v1.2.3 From de95e04791a03de5cb681980a3880db6919e3b4a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:54 -0700 Subject: net: Add extack to validator_info structs used for address notifier Add extack to in_validator_info and in6_validator_info. Update the one user of each, ipvlan, to return an error message for failures. Only manual configuration of an address is plumbed in the IPv6 code path. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 10 ++++++++-- include/linux/inetdevice.h | 1 + include/net/addrconf.h | 1 + net/ipv4/devinet.c | 8 +++++--- net/ipv6/addrconf.c | 22 ++++++++++++---------- 5 files changed, 27 insertions(+), 15 deletions(-) (limited to 'net/ipv6') diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 6842739b6679..f0ab55df57f1 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -847,8 +847,11 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) + if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { + NL_SET_ERR_MSG(i6vi->extack, + "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); + } break; } @@ -917,8 +920,11 @@ static int ipvlan_addr4_validator_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) + if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { + NL_SET_ERR_MSG(ivi->extack, + "Address already assigned to an ipvlan device"); return notifier_from_errno(-EADDRINUSE); + } break; } diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 751d051f0bc7..681dff30940b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -154,6 +154,7 @@ struct in_ifaddr { struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; + struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 87981cd63180..b8b16437c6d5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -55,6 +55,7 @@ struct prefix_info { struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; + struct netlink_ext_ack *extack; }; #define IN6_ADDR_HSIZE_SHIFT 4 diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e1e2ec0525e6..a4573bccd6da 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -444,7 +444,7 @@ static void check_lifetime(struct work_struct *work); static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -489,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; + ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); @@ -521,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { - return __inet_insert_ifa(ifa, NULL, 0); + return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) @@ -902,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, + extack); } else { inet_free_ifa(ifa); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dd9c0c435f71..93f9c0a61911 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -987,7 +987,7 @@ static struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, int scope, u32 flags, u32 valid_lft, u32 prefered_lft, - bool can_block) + bool can_block, struct netlink_ext_ack *extack) { gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); @@ -1019,6 +1019,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, struct in6_validator_info i6vi = { .i6vi_addr = *addr, .i6vi_dev = idev, + .extack = extack, }; err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); @@ -1356,7 +1357,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, true); + tmp_valid_lft, tmp_prefered_lft, true, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2040,7 +2041,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, scope, flags, valid_lft, - preferred_lft, false); + preferred_lft, false, NULL); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2498,7 +2499,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags, valid_lft, - prefered_lft, false); + prefered_lft, false, NULL); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2808,7 +2809,8 @@ static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, const struct in6_addr *peer_pfx, unsigned int plen, __u32 ifa_flags, - __u32 prefered_lft, __u32 valid_lft) + __u32 prefered_lft, __u32 valid_lft, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; @@ -2867,7 +2869,7 @@ static int inet6_addr_add(struct net *net, int ifindex, } ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft, true); + valid_lft, prefered_lft, true, extack); if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -2952,7 +2954,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) rtnl_lock(); err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, ireq.ifr6_prefixlen, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL); rtnl_unlock(); return err; } @@ -2983,7 +2985,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifp = ipv6_add_addr(idev, addr, NULL, plen, scope, IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, - true); + true, NULL); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3083,7 +3085,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev, #endif ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); @@ -4586,7 +4588,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, */ return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, ifm->ifa_prefixlen, ifa_flags, - preferred_lft, valid_lft); + preferred_lft, valid_lft, extack); } if (nlh->nlmsg_flags & NLM_F_EXCL || -- cgit v1.2.3 From b886d5f2f2906a866e33212734d204dfe35d50d9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 16:07:10 +0200 Subject: ipv6: start fib6 gc on RTF_CACHE dst creation After the commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache"), the fib6 gc is not started after the creation of a RTF_CACHE via a redirect or pmtu update, since fib6_add() isn't invoked anymore for such dsts. We need the fib6 gc to run periodically to clean the RTF_CACHE, or the dst will stay there forever. Fix it by explicitly calling fib6_force_start_gc() on successful exception creation. gc_args->more accounting will ensure that the gc timer will run for whatever time needed to properly clean the table. v2 -> v3: - clarified the commit message Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Paolo Abeni Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2e8842fa6450..efecdcff5055 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1340,8 +1340,10 @@ out: spin_unlock_bh(&rt6_exception_lock); /* Update fn->fn_sernum to invalidate all cached dst */ - if (!err) + if (!err) { fib6_update_sernum(ort); + fib6_force_start_gc(net); + } return err; } -- cgit v1.2.3 From 1859bac04fb696b858dbbff59503c945ec871bd9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 16:07:11 +0200 Subject: ipv6: remove from fib tree aged out RTF_CACHE dst The commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") partially reverted the commit 1e2ea8ad37be ("ipv6: set dst.obsolete when a cached route has expired"). As a result, RTF_CACHE dst referenced outside the fib tree will not be removed until the next sernum change; dst_check() does not fail on aged-out dst, and dst->__refcnt can't decrease: the aged out dst will stay valid for a potentially unlimited time after the timeout expiration. This change explicitly removes RTF_CACHE dst from the fib tree when aged out. The rt6_remove_exception() logic will then obsolete the dst and other entities will drop the related reference on next dst_check(). pMTU exceptions are not aged-out, and are removed from the exception table only when the - usually considerably longer - ip6_rt_mtu_expires timeout expires. v1 -> v2: - do not touch dst.obsolete in rt6_remove_exception(), not needed v2 -> v3: - take care of pMTU exceptions, too Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Paolo Abeni Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index efecdcff5055..074fac966018 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1575,7 +1575,13 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, { struct rt6_info *rt = rt6_ex->rt6i; - if (atomic_read(&rt->dst.__refcnt) == 1 && + /* we are pruning and obsoleting aged-out and non gateway exceptions + * even if others have still references to them, so that on next + * dst_check() such references can be dropped. + * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when + * expired, independently from their aging, as per RFC 8201 section 4 + */ + if (!(rt->rt6i_flags & RTF_EXPIRES) && time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { RT6_TRACE("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); @@ -1595,6 +1601,10 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, rt6_remove_exception(bucket, rt6_ex); return; } + } else if (__rt6_check_expired(rt)) { + RT6_TRACE("purging expired route %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; } gc_args->more++; } -- cgit v1.2.3 From b65f164d37cf6d4aac59b0e13c2e5c4cfe293fd2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 09:31:43 +0200 Subject: ipv6: let trace_fib6_table_lookup() dereference the fib table The perf traces for ipv6 routing code show a relevant cost around trace_fib6_table_lookup(), even if no trace is enabled. This is due to the fib6_table de-referencing currently performed by the caller. Let's the tracing code pay this overhead, passing to the trace helper the table pointer. This gives small but measurable performance improvement under UDP flood. Signed-off-by: Paolo Abeni Acked-by: Steven Rostedt (VMware) Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/trace/events/fib6.h | 6 +++--- net/ipv6/route.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv6') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index d60096cddb2a..b34bed17abc7 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -12,9 +12,9 @@ TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct rt6_info *rt, - u32 tb_id, const struct flowi6 *flp), + struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, rt, tb_id, flp), + TP_ARGS(net, rt, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -34,7 +34,7 @@ TRACE_EVENT(fib6_table_lookup, TP_fast_assign( struct in6_addr *in6; - __entry->tb_id = tb_id; + __entry->tb_id = table->tb6_id; __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 074fac966018..46c59a53c53f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -945,7 +945,7 @@ restart: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; @@ -1682,7 +1682,7 @@ redo_rt6_select: if (rt == net->ipv6.ip6_null_entry) { rcu_read_unlock(); dst_hold(&rt->dst); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (rt->rt6i_flags & RTF_CACHE) { if (ip6_hold_safe(net, &rt, true)) { @@ -1690,7 +1690,7 @@ redo_rt6_select: rt6_dst_from_metrics_check(rt); } rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(rt->rt6i_flags & RTF_GATEWAY))) { @@ -1726,7 +1726,7 @@ redo_rt6_select: } uncached_rt_out: - trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; } else { @@ -1754,7 +1754,7 @@ uncached_rt_out: } local_bh_enable(); rcu_read_unlock(); - trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, pcpu_rt, table, fl6); return pcpu_rt; } } @@ -2195,7 +2195,7 @@ out: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; }; -- cgit v1.2.3 From d3cc547d9ccbe223b06e87256fdd571eb63762b7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 19 Oct 2017 15:09:47 +0200 Subject: esp6: remove redundant initialization of esph The pointer esph is being initialized with a value that is never read and then being updated. Remove the redundant initialization and move the declaration and initializtion of esph to the local code block. Cleans up clang warning: net/ipv6/esp6.c:562:21: warning: Value stored to 'esph' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 89910e2c10f4..1696401fed6c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -559,14 +559,14 @@ static void esp_input_restore_header(struct sk_buff *skb) static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) { struct xfrm_state *x = xfrm_input_state(skb); - struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * decryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { - esph = skb_push(skb, 4); + struct ip_esp_hdr *esph = skb_push(skb, 4); + *seqhi = esph->spi; esph->spi = esph->seq_no; esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; -- cgit v1.2.3 From c24b14c46bb88d844275de5c4024c8745ae89d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:24 -0700 Subject: tcp: add tracepoint trace_tcp_send_reset New tracepoint trace_tcp_send_reset is added and called from tcp_v4_send_reset(), tcp_v6_send_reset() and tcp_send_active_reset(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 11 +++++++++++ net/core/net-traces.c | 2 ++ net/ipv4/tcp_ipv4.c | 6 +++++- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 10 ++++++++-- 5 files changed, 31 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 2b6fe72c6781..3e57e1ae1c6b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -77,6 +77,17 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_ARGS(sk, skb) ); +/* + * skb of trace_tcp_send_reset is the skb that caused RST. In case of + * active reset, skb should be NULL + */ +DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ diff --git a/net/core/net-traces.c b/net/core/net-traces.c index f4e4fa2db505..8dcd9b0be04a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -49,3 +49,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); + +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e22439f05e46..eb3f3b8e1e4b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include #include +#include + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 988733f289c8..1f01f4c9c738 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3084,6 +3084,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + /* skb of trace_tcp_send_reset() keeps the skb that caused RST, + * skb here is different to the troublesome skb, so use NULL + */ + trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ae83615b7f6d..0e2529958b52 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -69,6 +69,8 @@ #include #include +#include + static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -890,7 +892,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) int genhash; struct sock *sk1 = NULL; #endif - int oif; + int oif = 0; if (th->rst) return; @@ -939,7 +941,11 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - oif = sk ? sk->sk_bound_dev_if : 0; + if (sk) { + oif = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 56fc709b7a9fe191173dc772a881e180458db517 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:45 -0700 Subject: ipv6: addrconf: move ipv6_chk_same_addr() to avoid forward declaration ipv6_chk_same_addr() is only used by ipv6_add_addr_hash(), so moving it avoids a forward declaration. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 93f9c0a61911..9228030e3497 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -192,8 +192,6 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo); -static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev); static struct ipv6_devconf ipv6_devconf __read_mostly = { .forwarding = 0, @@ -957,6 +955,23 @@ static u32 inet6_addr_hash(const struct in6_addr *addr) return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev) +{ + unsigned int hash = inet6_addr_hash(addr); + struct inet6_ifaddr *ifp; + + hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (!dev || ifp->idev->dev == dev) + return true; + } + } + return false; +} + static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { unsigned int hash; @@ -1856,22 +1871,6 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, } EXPORT_SYMBOL(ipv6_chk_addr_and_flags); -static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev) -{ - unsigned int hash = inet6_addr_hash(addr); - struct inet6_ifaddr *ifp; - - hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { - if (!net_eq(dev_net(ifp->idev->dev), net)) - continue; - if (ipv6_addr_equal(&ifp->addr, addr)) { - if (!dev || ifp->idev->dev == dev) - return true; - } - } - return false; -} /* Compares an address/prefix_len with addresses on device @dev. * If one is found it returns true. -- cgit v1.2.3 From 752a92927e97e88096394dac3f10d12a58555254 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:46 -0700 Subject: ipv6: addrconf: factorize inet6_addr_hash() call ipv6_add_addr_hash() can compute the hash value outside of locked section and pass it to ipv6_chk_same_addr(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9228030e3497..c1a5028f394c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -956,9 +956,8 @@ static u32 inet6_addr_hash(const struct in6_addr *addr) } static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev) + struct net_device *dev, unsigned int hash) { - unsigned int hash = inet6_addr_hash(addr); struct inet6_ifaddr *ifp; hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { @@ -974,23 +973,19 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { - unsigned int hash; + unsigned int hash = inet6_addr_hash(&ifa->addr); int err = 0; spin_lock(&addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev)) { + if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) { ADBG("ipv6_add_addr: already assigned\n"); err = -EEXIST; - goto out; + } else { + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); } - /* Add to big hash table */ - hash = inet6_addr_hash(&ifa->addr); - hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); - -out: spin_unlock(&addrconf_hash_lock); return err; -- cgit v1.2.3 From 3f27fb23219e75343b094366f2358bff34300493 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:47 -0700 Subject: ipv6: addrconf: add per netns perturbation in inet6_addr_hash() Bring IPv6 in par with IPv4 : - Use net_hash_mix() to spread addresses a bit more. - Use 256 slots hash table instead of 16 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index b8b16437c6d5..15b5ffd7253d 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -58,7 +58,7 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 4 +#define IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) int addrconf_init(void); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c1a5028f394c..d70d98122053 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -950,9 +950,11 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_add_tail_rcu(&ifp->if_list, p); } -static u32 inet6_addr_hash(const struct in6_addr *addr) +static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr) { - return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, @@ -973,7 +975,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { - unsigned int hash = inet6_addr_hash(&ifa->addr); + unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr); int err = 0; spin_lock(&addrconf_hash_lock); @@ -1838,8 +1840,8 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict, u32 banned_flags) { + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp; - unsigned int hash = inet6_addr_hash(addr); u32 ifp_flags; rcu_read_lock_bh(); @@ -1917,8 +1919,8 @@ EXPORT_SYMBOL(ipv6_chk_prefix); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp, *result = NULL; - unsigned int hash = inet6_addr_hash(addr); rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { @@ -4242,9 +4244,9 @@ void if6_proc_exit(void) /* Check if address is a home address configured on any interface. */ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) { - int ret = 0; + unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp = NULL; - unsigned int hash = inet6_addr_hash(addr); + int ret = 0; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { -- cgit v1.2.3 From 480318a0a4d8b0d03ddedba1bd82e78050ede661 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:48 -0700 Subject: ipv6: addrconf: do not block BH in ipv6_chk_addr_and_flags() rcu_read_lock() is enough here, as inet6_ifa_finish_destroy() uses kfree_rcu() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d70d98122053..a6cf37b7e34c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1844,7 +1844,7 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, struct inet6_ifaddr *ifp; u32 ifp_flags; - rcu_read_lock_bh(); + rcu_read_lock(); hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; @@ -1858,12 +1858,12 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, !(ifp_flags&banned_flags) && (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { - rcu_read_unlock_bh(); + rcu_read_unlock(); return 1; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } EXPORT_SYMBOL(ipv6_chk_addr_and_flags); -- cgit v1.2.3 From 24f226da96278bf3956dac6fa293ef95bc22a90c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:49 -0700 Subject: ipv6: addrconf: do not block BH in ipv6_get_ifaddr() rcu_read_lock() is enough here, no need to block BH. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a6cf37b7e34c..6c1e7ffb62ff 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1922,8 +1922,8 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp, *result = NULL; - rcu_read_lock_bh(); - hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { + rcu_read_lock(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { @@ -1935,7 +1935,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add } } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return result; } -- cgit v1.2.3 From a5c1d98f8ccf4359575772b36ed51f5857dd7165 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:50 -0700 Subject: ipv6: addrconf: do not block BH in /proc/net/if_inet6 handling Table is really RCU protected, no need to block BH Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c1e7ffb62ff..9232e9537082 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4097,9 +4097,9 @@ struct if6_iter_state { static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) { - struct inet6_ifaddr *ifa = NULL; struct if6_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + struct inet6_ifaddr *ifa = NULL; int p = 0; /* initial bucket if pos is 0 */ @@ -4109,7 +4109,7 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) } for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { - hlist_for_each_entry_rcu_bh(ifa, &inet6_addr_lst[state->bucket], + hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; @@ -4135,7 +4135,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct if6_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - hlist_for_each_entry_continue_rcu_bh(ifa, addr_lst) { + hlist_for_each_entry_continue_rcu(ifa, addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; state->offset++; @@ -4144,7 +4144,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, while (++state->bucket < IN6_ADDR_HSIZE) { state->offset = 0; - hlist_for_each_entry_rcu_bh(ifa, + hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; @@ -4157,9 +4157,9 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, } static void *if6_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(rcu_bh) + __acquires(rcu) { - rcu_read_lock_bh(); + rcu_read_lock(); return if6_get_first(seq, *pos); } @@ -4173,9 +4173,9 @@ static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void if6_seq_stop(struct seq_file *seq, void *v) - __releases(rcu_bh) + __releases(rcu) { - rcu_read_unlock_bh(); + rcu_read_unlock(); } static int if6_seq_show(struct seq_file *seq, void *v) -- cgit v1.2.3 From 4e5f47ab97ce68d9f766dfedc5762940a90e4c11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:51 -0700 Subject: ipv6: addrconf: do not block BH in ipv6_chk_home_addr() rcu_read_lock() is enough here, no need to block BH. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9232e9537082..5a8a10229a07 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4248,8 +4248,8 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) struct inet6_ifaddr *ifp = NULL; int ret = 0; - rcu_read_lock_bh(); - hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { + rcu_read_lock(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr) && @@ -4258,7 +4258,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) break; } } - rcu_read_unlock_bh(); + rcu_read_unlock(); return ret; } #endif -- cgit v1.2.3 From 87b1af8dcc084d3d509d55067185ca74aed70ce2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 23 Oct 2017 14:59:35 -0700 Subject: ipv6: add ip6_null_entry check in rt6_select() In rt6_select(), fn->leaf could be pointing to net->ipv6.ip6_null_entry. In this case, we should directly return instead of trying to carry on with the rest of the process. If not, we could crash at: spin_lock_bh(&leaf->rt6i_table->rt6_lock); because net->ipv6.ip6_null_entry does not have rt6i_table set. Syzkaller recently reported following issue on net-next: Use struct sctp_sack_info instead kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: sctp: [Deprecated]: syz-executor4 (pid 26496) Use of struct sctp_assoc_value in delayed_ack socket option. Use struct sctp_sack_info instead CPU: 1 PID: 26523 Comm: syz-executor6 Not tainted 4.14.0-rc4+ #85 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d147e3c0 task.stack: ffff8801a4328000 RIP: 0010:debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline] RIP: 0010:do_raw_spin_lock+0x23/0x1e0 kernel/locking/spinlock_debug.c:112 RSP: 0018:ffff8801a432ed70 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000 RDX: 0000000000000003 RSI: 0000000000000000 RDI: 000000000000001c RBP: ffff8801a432ed90 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: ffffffff8482b279 R12: ffff8801ce2ff3a0 sctp: [Deprecated]: syz-executor1 (pid 26546) Use of int in maxseg socket option. Use struct sctp_assoc_value instead R13: dffffc0000000000 R14: ffff8801d971e000 R15: ffff8801ce2ff0d8 FS: 00007f56e82f5700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001ddbc22000 CR3: 00000001a4a04000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __raw_spin_lock_bh include/linux/spinlock_api_smp.h:136 [inline] _raw_spin_lock_bh+0x39/0x40 kernel/locking/spinlock.c:175 spin_lock_bh include/linux/spinlock.h:321 [inline] rt6_select net/ipv6/route.c:786 [inline] ip6_pol_route+0x1be3/0x3bd0 net/ipv6/route.c:1650 sctp: [Deprecated]: syz-executor1 (pid 26576) Use of int in maxseg socket option. Use struct sctp_assoc_value instead TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies. Check SNMP counters. ip6_pol_route_output+0x4c/0x60 net/ipv6/route.c:1843 fib6_rule_lookup+0x9e/0x2a0 net/ipv6/ip6_fib.c:309 ip6_route_output_flags+0x1f1/0x2b0 net/ipv6/route.c:1871 ip6_route_output include/net/ip6_route.h:80 [inline] ip6_dst_lookup_tail+0x4ea/0x970 net/ipv6/ip6_output.c:953 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1076 sctp_v6_get_dst+0x675/0x1c30 net/sctp/ipv6.c:274 sctp_transport_route+0xa8/0x430 net/sctp/transport.c:287 sctp_assoc_add_peer+0x4fe/0x1100 net/sctp/associola.c:656 __sctp_connect+0x251/0xc80 net/sctp/socket.c:1187 sctp_connect+0xb4/0xf0 net/sctp/socket.c:4209 inet_dgram_connect+0x16b/0x1f0 net/ipv4/af_inet.c:541 SYSC_connect+0x20a/0x480 net/socket.c:1642 SyS_connect+0x24/0x30 net/socket.c:1623 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 46c59a53c53f..605e5dc1c010 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -752,7 +752,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, bool do_rr = false; int key_plen; - if (!leaf) + if (!leaf || leaf == net->ipv6.ip6_null_entry) return net->ipv6.ip6_null_entry; rt0 = rcu_dereference(fn->rr_ptr); -- cgit v1.2.3 From c4f3db15958277c03d1c324894255ea3ecbf86e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:40 +0200 Subject: netfilter: conntrack: add and use nf_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. As a first step, add and use a new log function for this, similar to nf_ct_helper_log(). Add __cold annotation -- invalid packets should be infrequent so gcc can consider all call paths that lead to such a function as unlikely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 10 +++++++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 18 ++++++------ net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 14 +++++---- net/netfilter/nf_conntrack_proto.c | 24 ++++++++++++++++ net/netfilter/nf_conntrack_proto_dccp.c | 3 +- net/netfilter/nf_conntrack_proto_sctp.c | 3 +- net/netfilter/nf_conntrack_proto_tcp.c | 22 +++++++------- net/netfilter/nf_conntrack_proto_udp.c | 40 ++++++++++++-------------- 8 files changed, 82 insertions(+), 52 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 738a0307a96b..6d79a061d360 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -152,8 +152,18 @@ extern const struct nla_policy nf_ct_port_nla_policy[]; #define LOG_INVALID(net, proto) \ ((net)->ct.sysctl_log_invalid == (proto) || \ (net)->ct.sysctl_log_invalid == IPPROTO_RAW) + +__printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...); #else static inline int LOG_INVALID(struct net *net, int proto) { return 0; } + +static inline __printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, + u16 pf, u8 protonum, const char *fmt, ...) {} #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a046c298413a..7281a7b77a0e 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -165,6 +165,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + /* Small and modified version of icmp_rcv */ static int icmp_error(struct net *net, struct nf_conn *tmpl, @@ -177,18 +183,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, - NULL, "nf_ct_icmp: short packet "); + icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: bad HW ICMP checksum "); + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -199,9 +201,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: invalid ICMP type "); + icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index a9e1fd1a8536..0f227ca4a5a2 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -176,6 +176,12 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } +static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); +} + static int icmpv6_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -187,17 +193,13 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: short packet "); + icmpv6_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: ICMPv6 checksum failed "); + icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b3e489c859ec..bcd3ee270d75 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -27,6 +27,7 @@ #include #include #include +#include static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; @@ -63,6 +64,29 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, *header = NULL; *table = NULL; } + +__printf(5, 6) +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (net->ct.sysctl_log_invalid != protonum || + net->ct.sysctl_log_invalid != IPPROTO_RAW) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_proto_%d: %pV ", protonum, &vaf); + va_end(args); +} +EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); #endif const struct nf_conntrack_l4proto * diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 0f5a4d79f6b8..ef501c7edb96 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -604,8 +604,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_DCCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 6303a88af12b..aa630c561361 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -522,8 +522,7 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, } return NF_ACCEPT; out_invalid: - if (LOG_INVALID(net, IPPROTO_SCTP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg); + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index cba1c6ffe51a..14198b2a2e2c 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -738,6 +738,12 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| [TCPHDR_ACK|TCPHDR_URG] = 1, }; +static void tcp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); +} + /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static int tcp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, @@ -753,17 +759,13 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: short packet "); + tcp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: truncated/malformed packet "); + tcp_error_log(skb, net, pf, "truncated packet"); return -NF_ACCEPT; } @@ -774,18 +776,14 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, /* FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: bad TCP checksum "); + tcp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(net, IPPROTO_TCP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid TCP flag combination "); + tcp_error_log(skb, net, pf, "invalid tcp flag combination"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8af734cd1a94..fc20cf430251 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -99,6 +99,12 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, } #ifdef CONFIG_NF_CT_PROTO_UDPLITE +static void udplite_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg); +} + static int udplite_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -112,9 +118,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: short packet "); + udplite_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } @@ -122,17 +126,13 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: invalid checksum coverage "); + udplite_error_log(skb, net, pf, "invalid checksum coverage"); return -NF_ACCEPT; } /* UDPLITE mandates checksums */ if (!hdr->check) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: checksum missing "); + udplite_error_log(skb, net, pf, "checksum missing"); return -NF_ACCEPT; } @@ -140,9 +140,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: bad UDPLite checksum "); + udplite_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } @@ -150,6 +148,12 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, } #endif +static void udp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg); +} + static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t pf, @@ -162,17 +166,13 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: short packet "); + udp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: truncated/malformed packet "); + udp_error_log(skb, net, pf, "truncated/malformed packet"); return -NF_ACCEPT; } @@ -186,9 +186,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, * FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - if (LOG_INVALID(net, IPPROTO_UDP)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udp: bad UDP checksum "); + udp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } -- cgit v1.2.3 From eb6fad5a4a328b85d3faa8b301b522e3f316b49d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:42 +0200 Subject: netfilter: conntrack: remove pf argument from l4 packet functions not needed/used anymore. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 1 - net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 1 - net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 1 - net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_proto_dccp.c | 1 - net/netfilter/nf_conntrack_proto_generic.c | 1 - net/netfilter/nf_conntrack_proto_gre.c | 1 - net/netfilter/nf_conntrack_proto_sctp.c | 1 - net/netfilter/nf_conntrack_proto_tcp.c | 6 ++---- net/netfilter/nf_conntrack_proto_udp.c | 1 - 10 files changed, 3 insertions(+), 13 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 5d51255b5bfb..e06518874144 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,6 @@ struct nf_conntrack_l4proto { const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts); /* Called when a new connection for this protocol found; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7281a7b77a0e..8969420cecc3 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -81,7 +81,6 @@ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 0f227ca4a5a2..dca921df28e1 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -94,7 +94,6 @@ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 01130392b7c0..28e675150853 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1419,7 +1419,7 @@ repeat: /* Decide what timeout policy we want to apply to this flow. */ timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 49e0abcdc6f4..2a446f4a554c 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -467,7 +467,6 @@ static unsigned int *dccp_get_timeouts(struct net *net) static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 9cd40700842e..1f86ddf6649a 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -60,7 +60,6 @@ static int generic_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 09a90484c27d..a2503005d80b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -244,7 +244,6 @@ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic both ways, this is a GRE connection. diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index aa630c561361..80faf04ddf15 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -306,7 +306,6 @@ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { enum sctp_conntrack new_state, old_state; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index dced574f6006..8f283294d70f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -493,8 +493,7 @@ static bool tcp_in_window(const struct nf_conn *ct, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *tcph, - u_int8_t pf) + const struct tcphdr *tcph) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); @@ -801,7 +800,6 @@ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); @@ -1013,7 +1011,7 @@ static int tcp_packet(struct nf_conn *ct, } if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th, pf)) { + skb, dataoff, th)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index fc20cf430251..3a5f727103af 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -73,7 +73,6 @@ static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts) { /* If we've seen traffic both ways, this is some kind of UDP -- cgit v1.2.3 From d13e7b2e65f6dfbe97b845d75741a970181b9fec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 01:13:51 +0200 Subject: netfilter: x_tables: don't use seqlock when fetching old counters after previous commit xt_replace_table will wait until all cpus had even seqcount (i.e., no cpu is accessing old ruleset). Add a 'old' counter retrival version that doesn't synchronize counters. Its not needed, the old counters are not in use anymore at this point. This speeds up table replacement on busy systems with large tables (and many cores). Cc: Dan Williams Cc: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 22 ++++++++++++++++++++-- net/ipv4/netfilter/ip_tables.c | 23 +++++++++++++++++++++-- net/ipv6/netfilter/ip6_tables.c | 22 ++++++++++++++++++++-- 3 files changed, 61 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 9e2770fd00be..f88221aebc9d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -634,6 +634,25 @@ static void get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct arpt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -910,8 +929,7 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 39286e543ee6..4cbe5e80f3bf 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -781,6 +781,26 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ipt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; /* macro does multi eval of i */ + } + + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1070,8 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 01bd3ee5ebc6..f06e25065a34 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -800,6 +800,25 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ip6t_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1090,8 +1109,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) -- cgit v1.2.3 From 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 09:38:30 +0200 Subject: netfilter: conntrack: make l3proto trackers const previous patches removed all writes to them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 2 +- include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 919e4e8af327..5534ecca7a5d 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -10,7 +10,7 @@ #define _NF_CONNTRACK_IPV4_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index eaea968f8657..30dc57980866 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -1,7 +1,7 @@ #ifndef _NF_CONNTRACK_IPV6_H #define _NF_CONNTRACK_IPV6_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fe374da4bc13..89af9d88ca21 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -344,7 +344,7 @@ static void ipv4_hooks_unregister(struct net *net) mutex_unlock(®ister_ipv4_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index fe01dc953c56..3b80a38f62b8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -339,7 +339,7 @@ static void ipv6_hooks_unregister(struct net *net) mutex_unlock(®ister_ipv6_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, -- cgit v1.2.3 From 908d140a87a794bf89717ceae54aba5ce86c52e4 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sat, 21 Oct 2017 00:25:15 +0300 Subject: ip6_tunnel: Allow rcv/xmit even if remote address is a local address Currently, ip6_tnl_xmit_ctl drops tunneled packets if the remote address (outer v6 destination) is one of host's locally configured addresses. Same applies to ip6_tnl_rcv_ctl: it drops packets if the remote address (outer v6 source) is a local address. This prevents using ipxip6 (and ip6_gre) tunnels whose local/remote endpoints are on same host; OTOH v4 tunnels (ipip or gre) allow such configurations. An example where this proves useful is a system where entities are identified by their unique v6 addresses, and use tunnels to encapsulate traffic between them. The limitation prevents placing several entities on same host. Introduce IP6_TNL_F_ALLOW_LOCAL_REMOTE which allows to bypass this restriction. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. Miller --- include/uapi/linux/ip6_tunnel.h | 2 ++ net/ipv6/ip6_tunnel.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 425926c467d7..ffebbe365478 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -20,6 +20,8 @@ #define IP6_TNL_F_RCV_DSCP_COPY 0x10 /* copy fwmark from inner packet */ #define IP6_TNL_F_USE_ORIG_FWMARK 0x20 +/* allow remote endpoint on the local node */ +#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0x40 struct ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4212879ff35e..439d65f7e094 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -770,7 +770,8 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr(net, laddr, ldev, 0))) && - likely(!ipv6_chk_addr(net, raddr, NULL, 0))) + ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || + likely(!ipv6_chk_addr(net, raddr, NULL, 0)))) ret = 1; } return ret; @@ -1000,7 +1001,8 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) pr_warn("%s xmit: Local address not yet configured!\n", p->name); - else if (!ipv6_addr_is_multicast(raddr) && + else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && + !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", p->name); -- cgit v1.2.3 From eee12df5a0bd5769af5efb72fa95dd1f633a266c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 26 Oct 2017 07:51:06 -0500 Subject: ipv6: esp6: use BUG_ON instead of if condition followed by BUG Use BUG_ON instead of if condition followed by BUG in esp_remove_trailer. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/ipv6/esp6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 1696401fed6c..4000b71bfdc5 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,8 +483,8 @@ static inline int esp_remove_trailer(struct sk_buff *skb) goto out; } - if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) - BUG(); + ret = skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2); + BUG_ON(ret); ret = -EINVAL; padlen = nexthdr[0]; -- cgit v1.2.3 From 74b6551b9f4106a3e69ab46e60b52a8947dcf60c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 26 Oct 2017 23:10:35 -0500 Subject: ipv6: exthdrs: use swap macro in ipv6_dest_hao make use of the swap macro and remove unnecessary variable tmp_addr. This makes the code easier to read and maintain. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 7835dea930b4..9f918a770f87 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -187,7 +187,6 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); - struct in6_addr tmp_addr; int ret; if (opt->dsthao) { @@ -229,9 +228,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; - tmp_addr = ipv6h->saddr; - ipv6h->saddr = hao->addr; - hao->addr = tmp_addr; + swap(ipv6h->saddr, hao->addr); if (skb->tstamp == 0) __net_timestamp(skb); -- cgit v1.2.3 From ceef9ab6be7234f9e49f79769e0da88d1dccfcc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:24 -0700 Subject: tcp: Namespace-ify sysctl_tcp_workaround_signed_windows Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 4 ++-- net/ipv4/syncookies.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 14 +++++--------- net/ipv6/syncookies.c | 2 +- 7 files changed, 18 insertions(+), 21 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 60bccda046db..e74c7c1b0d18 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -145,6 +145,7 @@ struct netns_ipv4 { int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; + int sysctl_tcp_workaround_signed_windows; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed0828dc82f1..e338e16178dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_workaround_signed_windows; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ @@ -1302,7 +1301,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) } /* Determine a window scaling and initial window to offer. */ -void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, +void tcp_select_initial_window(const struct sock *sk, int __space, + __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 77cf32a80952..fda37f2862c9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a053cacb8290..3ae9012a4979 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -457,13 +457,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, }, - { - .procname = "tcp_workaround_signed_windows", - .data = &sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_limit_output_bytes", .data = &sysctl_tcp_limit_output_bytes, @@ -1145,6 +1138,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_workaround_signed_windows", + .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3270ab8416ce..3c65c1a3f944 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -369,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(full_space, + tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 60df3ab52166..5bbed67c27e9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,11 +45,6 @@ #include -/* People can turn this on to work with those rare, broken TCPs that - * interpret the window field as a signed quantity. - */ -int sysctl_tcp_workaround_signed_windows __read_mostly = 0; - /* Default TSQ limit of four TSO segments */ int sysctl_tcp_limit_output_bytes __read_mostly = 262144; @@ -196,7 +191,7 @@ u32 tcp_default_init_rwnd(u32 mss) * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ -void tcp_select_initial_window(int __space, __u32 mss, +void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) @@ -220,7 +215,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ - if (sysctl_tcp_workaround_signed_windows) + if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; @@ -280,7 +275,8 @@ static u16 tcp_select_window(struct sock *sk) /* Make sure we do not exceed the maximum possible * scaled window. */ - if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -3349,7 +3345,7 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 4e7817abc0b9..e7a3a6b6cf56 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -244,7 +244,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); -- cgit v1.2.3 From 2ea2352ede9d97585164a7e19224955f4e4ca8db Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 27 Oct 2017 17:30:12 -0700 Subject: ipv6: prevent user from adding cached routes Cached routes should only be created by the system when receiving pmtu discovery or ip redirect msg. Users should not be allowed to create cached routes. Furthermore, after the patch series to move cached routes into exception table, user added cached routes will trigger the following warning in fib6_add(): WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137 fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297 RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814 RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780 RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360 R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521 sock_do_ioctl+0x65/0xb0 net/socket.c:961 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe So we fix this by failing the attemp to add cached routes from userspace with returning EINVAL error. Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 2 +- net/ipv6/route.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index d496c02e14bc..c15d8054905c 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -28,7 +28,7 @@ #define RTF_ROUTEINFO 0x00800000 /* route information - RA */ -#define RTF_CACHE 0x01000000 /* cache entry */ +#define RTF_CACHE 0x01000000 /* read-only: can not be set by user */ #define RTF_FLOW 0x02000000 /* flow significant route */ #define RTF_POLICY 0x04000000 /* policy route */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 605e5dc1c010..70d9659fc1e9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2478,6 +2478,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; -- cgit v1.2.3 From 6c31e5a91fde2e718e59c8a627c56451f88be54c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 Oct 2017 17:37:13 -0700 Subject: net: Add extack to fib_notifier_info Add extack to fib_notifier_info and plumb through stack to call_fib_rule_notifiers, call_fib_entry_notifiers and call_fib6_entry_notifiers. This allows notifer handlers to return messages to user. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 1 + net/core/fib_rules.c | 9 ++++++--- net/ipv4/fib_trie.c | 13 ++++++++----- net/ipv6/ip6_fib.c | 15 +++++++++------ 4 files changed, 24 insertions(+), 14 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 54cd6b839d2f..c91ec732afd6 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -9,6 +9,7 @@ struct fib_notifier_info { struct net *net; int family; + struct netlink_ext_ack *extack; }; enum fib_event_type { diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a6d97c1d810..fafd0a41e3f7 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -314,10 +314,12 @@ static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, - struct fib_rules_ops *ops) + struct fib_rules_ops *ops, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, + .info.extack = extack, .rule = rule, }; @@ -609,7 +611,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -749,7 +751,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, + NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index aaa1ba09afaa..5ddc4aefff12 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -102,9 +102,11 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -1214,7 +1216,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa); + key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1269,7 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, new_fa); + call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1569,7 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete); + fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1886,7 +1888,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, fa); + KEYLENGTH - fa->fa_slen, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1ada9672d198..2e2804f5823e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -353,9 +353,11 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct rt6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; @@ -868,7 +870,8 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc) + struct nl_info *info, struct mx6_config *mxc, + struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1011,7 +1014,7 @@ add: rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, - rt); + rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -1040,7 +1043,7 @@ add: rt->dst.rt6_next = iter->dst.rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, - rt); + rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { @@ -1225,7 +1228,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc); + err = fib6_add_rt2node(fn, rt, info, mxc, extack); if (!err) { fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); @@ -1686,7 +1689,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt); + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); -- cgit v1.2.3 From 3051fbec206eb6967b7fdecedb63ebb1ed67a1a7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Oct 2017 10:07:17 -0700 Subject: net: sit: Update lookup to handle links set to L3 slave Using SIT tunnels with VRFs works fine if the underlay device is in a VRF and the link parameter is set to the VRF device. e.g., ip tunnel add jtun mode sit remote local dev myvrf Update the device check to allow the link to be the enslaved device as well. e.g., ip tunnel add jtun mode sit remote local dev eth4 where eth4 is enslaved to myvrf. Reported-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/sit.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a799f5258614..d60ddcb0bfe2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -91,29 +91,35 @@ struct sit_net { * Must be invoked with rcu_read_lock */ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, - struct net_device *dev, __be32 remote, __be32 local) + struct net_device *dev, + __be32 remote, __be32 local, + int sifindex) { unsigned int h0 = HASH(remote); unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); + int ifindex = dev ? dev->ifindex : 0; for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && - (!dev || !t->parms.link || dev->ifindex == t->parms.link) && + (!dev || !t->parms.link || ifindex == t->parms.link || + sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } @@ -486,6 +492,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) const int code = icmp_hdr(skb)->code; unsigned int data_len = 0; struct ip_tunnel *t; + int sifindex; int err; switch (type) { @@ -517,10 +524,9 @@ static int ipip6_err(struct sk_buff *skb, u32 info) err = -ENOENT; - t = ipip6_tunnel_lookup(dev_net(skb->dev), - skb->dev, - iph->daddr, - iph->saddr); + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; + t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->daddr, iph->saddr, sifindex); if (!t) goto out; @@ -633,10 +639,12 @@ static int ipip6_rcv(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; + int sifindex; int err; + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, - iph->saddr, iph->daddr); + iph->saddr, iph->daddr, sifindex); if (tunnel) { struct pcpu_sw_netstats *tstats; @@ -704,10 +712,13 @@ static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; + int sifindex; + + sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; iph = ip_hdr(skb); tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, - iph->saddr, iph->daddr); + iph->saddr, iph->daddr, sifindex); if (tunnel) { const struct tnl_ptk_info *tpi; -- cgit v1.2.3 From da13c59b9936dfedcf9f2203bd29fbf83ad672bf Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 30 Oct 2017 19:38:52 -0400 Subject: net: display hw address of source machine during ipv6 DAD failure This patch updates the error messages displayed in kernel log to include hwaddress of the source machine that caused ipv6 duplicate address detection failures. Examples: a) When we receive a NA packet from another machine advertising our address: ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2001:db8:: on eth0! b) When we detect DAD failure during address assignment to an interface: IPv6: eth0: IPv6 duplicate address 2001:db8:: used by 34:ab:cd:56:11:e8 detected! v2: Changed %pI6 to %pI6c in ndisc_recv_na() Chaged the v6 address in the commit message to 2001:db8:: Suggested-by: Igor Lubashev Signed-off-by: Vishwanath Pai Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ndisc.c | 9 +++++---- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 15b5ffd7253d..2a616ea53956 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -208,7 +208,7 @@ void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); -void addrconf_dad_failure(struct inet6_ifaddr *ifp); +void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5a8a10229a07..cfa374c8b54c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1987,7 +1987,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp) return err; } -void addrconf_dad_failure(struct inet6_ifaddr *ifp) +void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net *net = dev_net(ifp->idev->dev); @@ -1997,8 +1997,8 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) return; } - net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n", - ifp->idev->dev->name, &ifp->addr); + net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM detected!\n", + ifp->idev->dev->name, &ifp->addr, eth_hdr(skb)->h_source); spin_lock_bh(&ifp->lock); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 266a530414d7..f9c3ffe04382 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -46,6 +46,7 @@ #endif #include +#include #include #include #include @@ -822,7 +823,7 @@ have_ifp: * who is doing DAD * so fail our DAD process */ - addrconf_dad_failure(ifp); + addrconf_dad_failure(skb, ifp); return; } else { /* @@ -975,7 +976,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (ifp) { if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { - addrconf_dad_failure(ifp); + addrconf_dad_failure(skb, ifp); return; } /* What should we make now? The advertisement @@ -989,8 +990,8 @@ static void ndisc_recv_na(struct sk_buff *skb) */ if (skb->pkt_type != PACKET_LOOPBACK) ND_PRINTK(1, warn, - "NA: someone advertises our address %pI6 on %s!\n", - &ifp->addr, ifp->idev->dev->name); + "NA: %pM advertised our address %pI6c on %s!\n", + eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return; } -- cgit v1.2.3 From 47d3d7ac656a1ffb9d0f0d3c845663ed6fd7e78d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Oct 2017 14:16:00 -0700 Subject: ipv6: Implement limits on Hop-by-Hop and Destination options RFC 8200 (IPv6) defines Hop-by-Hop options and Destination options extension headers. Both of these carry a list of TLVs which is only limited by the maximum length of the extension header (2048 bytes). By the spec a host must process all the TLVs in these options, however these could be used as a fairly obvious denial of service attack. I think this could in fact be a significant DOS vector on the Internet, one mitigating factor might be that many FWs drop all packets with EH (and obviously this is only IPv6) so an Internet wide attack might not be so effective (yet!). By my calculation, the worse case packet with TLVs in a standard 1500 byte MTU packet that would be processed by the stack contains 1282 invidual TLVs (including pad TLVS) or 724 two byte TLVs. I wrote a quick test program that floods a whole bunch of these packets to a host and sure enough there is substantial time spent in ip6_parse_tlv. These packets contain nothing but unknown TLVS (that are ignored), TLV padding, and bogus UDP header with zero payload length. 25.38% [kernel] [k] __fib6_clean_all 21.63% [kernel] [k] ip6_parse_tlv 4.21% [kernel] [k] __local_bh_enable_ip 2.18% [kernel] [k] ip6_pol_route.isra.39 1.98% [kernel] [k] fib6_walk_continue 1.88% [kernel] [k] _raw_write_lock_bh 1.65% [kernel] [k] dst_release This patch adds configurable limits to Destination and Hop-by-Hop options. There are three limits that may be set: - Limit the number of options in a Hop-by-Hop or Destination options extension header. - Limit the byte length of a Hop-by-Hop or Destination options extension header. - Disallow unrecognized options in a Hop-by-Hop or Destination options extension header. The limits are set in corresponding sysctls: ipv6.sysctl.max_dst_opts_cnt ipv6.sysctl.max_hbh_opts_cnt ipv6.sysctl.max_dst_opts_len ipv6.sysctl.max_hbh_opts_len If a max_*_opts_cnt is less than zero then unknown TLVs are disallowed. The number of known TLVs that are allowed is the absolute value of this number. If a limit is exceeded when processing an extension header the packet is dropped. Default values are set to 8 for options counts, and set to INT_MAX for maximum length. Note the choice to limit options to 8 is an arbitrary guess (roughly based on the fact that the stack supports three HBH options and just one destination option). These limits have being proposed in draft-ietf-6man-rfc6434-bis. Tested (by Martin Lau) I tested out 1 thread (i.e. one raw_udp process). I changed the net.ipv6.max_dst_(opts|hbh)_number between 8 to 2048. With sysctls setting to 2048, the softirq% is packed to 100%. With 8, the softirq% is almost unnoticable from mpstat. v2; - Code and documention cleanup. - Change references of RFC2460 to be RFC8200. - Add reference to RFC6434-bis where the limits will be in standard. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 24 ++++++++++++ include/net/ipv6.h | 40 ++++++++++++++++++++ include/net/netns/ipv6.h | 4 ++ net/ipv6/af_inet6.c | 4 ++ net/ipv6/exthdrs.c | 67 ++++++++++++++++++++++++++++------ net/ipv6/sysctl_net_ipv6.c | 32 ++++++++++++++++ 6 files changed, 159 insertions(+), 12 deletions(-) (limited to 'net/ipv6') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 77f4de59dc9c..e6661b205f72 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1385,6 +1385,30 @@ mld_qrv - INTEGER Default: 2 (as specified by RFC3810 9.1) Minimum: 1 (as specified by RFC6636 4.5) +max_dst_opts_cnt - INTEGER + Maximum number of non-padding TLVs allowed in a Destination + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + Default: 8 + +max_hbh_opts_cnt - INTEGER + Maximum number of non-padding TLVs allowed in a Hop-by-Hop + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + Default: 8 + +max dst_opts_len - INTEGER + Maximum length allowed for a Destination options extension + header. + Default: INT_MAX (unlimited) + +max hbh_opts_len - INTEGER + Maximum length allowed for a Hop-by-Hop options extension + header. + Default: INT_MAX (unlimited) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3cda3b521c36..fb6d67012de6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -51,6 +51,46 @@ #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 +/* Limits on Hop-by-Hop and Destination options. + * + * Per RFC8200 there is no limit on the maximum number or lengths of options in + * Hop-by-Hop or Destination options other then the packet must fit in an MTU. + * We allow configurable limits in order to mitigate potential denial of + * service attacks. + * + * There are three limits that may be set: + * - Limit the number of options in a Hop-by-Hop or Destination options + * extension header + * - Limit the byte length of a Hop-by-Hop or Destination options extension + * header + * - Disallow unknown options + * + * The limits are expressed in corresponding sysctls: + * + * ipv6.sysctl.max_dst_opts_cnt + * ipv6.sysctl.max_hbh_opts_cnt + * ipv6.sysctl.max_dst_opts_len + * ipv6.sysctl.max_hbh_opts_len + * + * max_*_opts_cnt is the number of TLVs that are allowed for Destination + * options or Hop-by-Hop options. If the number is less than zero then unknown + * TLVs are disallowed and the number of known options that are allowed is the + * absolute value. Setting the value to INT_MAX indicates no limit. + * + * max_*_opts_len is the length limit in bytes of a Destination or + * Hop-by-Hop options extension header. Setting the value to INT_MAX + * indicates no length limit. + * + * If a limit is exceeded when processing an extension header the packet is + * silently discarded. + */ + +/* Default limits for Hop-by-Hop and Destination options */ +#define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ +#define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ + /* * Addr type * diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2ea1ed341ef8..600ba1c1befc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -37,6 +37,10 @@ struct netns_sysctl_ipv6 { int idgen_delay; int flowlabel_state_ranges; int flowlabel_reflect; + int max_dst_opts_cnt; + int max_hbh_opts_cnt; + int max_dst_opts_len; + int max_hbh_opts_len; }; struct netns_ipv6 { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fe5262fd6aa5..c26f71234b9c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -810,6 +810,10 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; net->ipv6.sysctl.flowlabel_state_ranges = 0; + net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT; + net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; + net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; + net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 9f918a770f87..83bd75713535 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -74,8 +74,20 @@ struct tlvtype_proc { /* An unknown option is detected, decide what to do */ -static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, + bool disallow_unknowns) { + if (disallow_unknowns) { + /* If unknown TLVs are disallowed by configuration + * then always silently drop packet. Note this also + * means no ICMP parameter problem is sent which + * could be a good property to mitigate a reflection DOS + * attack. + */ + + goto drop; + } + switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; @@ -95,20 +107,30 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) return false; } +drop: kfree_skb(skb); return false; } /* Parse tlv encoded option header (hop-by-hop or destination) */ -static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) +static bool ip6_parse_tlv(const struct tlvtype_proc *procs, + struct sk_buff *skb, + int max_count) { - const struct tlvtype_proc *curr; + int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); - int len = (skb_transport_header(skb)[1] + 1) << 3; + const struct tlvtype_proc *curr; + bool disallow_unknowns = false; + int tlv_count = 0; int padlen = 0; + if (unlikely(max_count < 0)) { + disallow_unknowns = true; + max_count = -max_count; + } + if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; @@ -149,6 +171,11 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) default: /* Other TLV code so scan list */ if (optlen > len) goto bad; + + tlv_count++; + if (tlv_count > max_count) + goto bad; + for (curr = procs; curr->type >= 0; curr++) { if (curr->type == nh[off]) { /* type specific length/alignment @@ -159,10 +186,10 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) break; } } - if (curr->type < 0) { - if (ip6_tlvopt_unknown(skb, off) == 0) - return false; - } + if (curr->type < 0 && + !ip6_tlvopt_unknown(skb, off, disallow_unknowns)) + return false; + padlen = 0; break; } @@ -258,23 +285,31 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(skb->dev); + int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_dst_opts_len) + goto fail_and_free; + opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprocdestopt_lst, skb, + init_net.ipv6.sysctl.max_dst_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; @@ -803,6 +838,8 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); + int extlen; /* * skb_network_header(skb) is equal to skb->data, and @@ -813,13 +850,19 @@ int ipv6_parse_hopopts(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_hbh_opts_len) + goto fail_and_free; + opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprochopopt_lst, skb, + init_net.ipv6.sysctl.max_hbh_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 6fbf8ae5e52c..4a2f0fd870bc 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -97,6 +97,34 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "max_dst_opts_number", + .data = &init_net.ipv6.sysctl.max_dst_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_opts_number", + .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_dst_opts_length", + .data = &init_net.ipv6.sysctl.max_dst_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_length", + .data = &init_net.ipv6.sysctl.max_hbh_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -157,6 +185,10 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect; + ipv6_table[10].data = &net->ipv6.sysctl.max_dst_opts_cnt; + ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; + ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; + ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3 From 27c565ae9d554fa1c00c799754cff43476c8d3b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Nov 2017 08:53:27 -0700 Subject: ipv6: remove IN6_ADDR_HSIZE from addrconf.h IN6_ADDR_HSIZE is private to addrconf.c, move it here to avoid confusion. Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 3 --- net/ipv6/addrconf.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3357332ea375..b623b65a79d1 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -59,9 +59,6 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 8 -#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) - int addrconf_init(void); void addrconf_cleanup(void); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 69b8cdb43aa2..66d8c3d912fd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -157,6 +157,8 @@ static int ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); +#define IN6_ADDR_HSIZE_SHIFT 8 +#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) /* * Configured unicast address hash table */ -- cgit v1.2.3 From 5caaed151a68ae36aca2981cc245f5960a0a7603 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Nov 2017 19:41:09 +0100 Subject: netfilter: conntrack: don't cache nlattr_tuple_size result in nla_size We currently call ->nlattr_tuple_size() once at register time and cache result in l4proto->nla_size. nla_size is the only member that is written to, avoiding this would allow to make l4proto trackers const. We can use ->nlattr_tuple_size() at run time, and cache result in the individual trackers instead. This is an intermediate step, next patch removes nlattr_size() callback and computes size at compile time, then removes nla_size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 4 ++-- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 9 +++++++-- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 9 +++++++-- net/netfilter/nf_conntrack_core.c | 9 +++++++-- net/netfilter/nf_conntrack_netlink.c | 10 +++++++--- net/netfilter/nf_conntrack_proto.c | 2 -- net/netfilter/nf_conntrack_proto_tcp.c | 9 +++++++-- 7 files changed, 37 insertions(+), 15 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index e06518874144..46e786ffcf2f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -74,7 +74,7 @@ struct nf_conntrack_l4proto { int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ - int (*nlattr_tuple_size)(void); + unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; @@ -144,7 +144,7 @@ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t); -int nf_ct_port_nlattr_tuple_size(void); +unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 8969420cecc3..1849fedd9b81 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -258,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmp_nlattr_tuple_size(void) +static unsigned int icmp_nlattr_tuple_size(void) { - return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index dca921df28e1..3ac0d826afc4 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -259,9 +259,14 @@ static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmpv6_nlattr_tuple_size(void) +static unsigned int icmpv6_nlattr_tuple_size(void) { - return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 28e675150853..0e516947c16f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1563,9 +1563,14 @@ int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); -int nf_ct_port_nlattr_tuple_size(void) +unsigned int nf_ct_port_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index de4053d84364..6e0adfefb9ed 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -533,11 +533,11 @@ nla_put_failure: return -1; } -static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) +static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; - size_t len; + size_t len, len4 = 0; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); len = l3proto->nla_size; @@ -545,8 +545,12 @@ static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); len += l4proto->nla_size; + if (l4proto->nlattr_tuple_size) { + len4 = l4proto->nlattr_tuple_size(); + len4 *= 3u; /* ORIG, REPLY, MASTER */ + } - return len; + return len + len4; } static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 83f739e9dc08..3b06ff3f2dee 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -398,8 +398,6 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto) l4proto->nla_size = 0; if (l4proto->nlattr_size) l4proto->nla_size += l4proto->nlattr_size(); - if (l4proto->nlattr_tuple_size) - l4proto->nla_size += 3 * l4proto->nlattr_tuple_size(); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], l4proto); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 8f283294d70f..b12fc07111d0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1277,9 +1277,14 @@ static int tcp_nlattr_size(void) + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); } -static int tcp_nlattr_tuple_size(void) +static unsigned int tcp_nlattr_tuple_size(void) { - return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif -- cgit v1.2.3 From fffcefe967a02997be7a296a4f0766b29dcd1a67 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Nov 2017 14:13:29 -0800 Subject: ipv6: addrconf: fix a lockdep splat Fixes a case where GFP_ATOMIC allocation must be used instead of GFP_KERNEL one. [ 54.891146] lock_acquire+0xb3/0x2f0 [ 54.891153] ? fs_reclaim_acquire.part.60+0x5/0x30 [ 54.891165] fs_reclaim_acquire.part.60+0x29/0x30 [ 54.891170] ? fs_reclaim_acquire.part.60+0x5/0x30 [ 54.891178] kmem_cache_alloc_trace+0x3f/0x500 [ 54.891186] ? cyc2ns_read_end+0x1e/0x30 [ 54.891196] ipv6_add_addr+0x15a/0xc30 [ 54.891217] ? ipv6_create_tempaddr+0x2ea/0x5d0 [ 54.891223] ipv6_create_tempaddr+0x2ea/0x5d0 [ 54.891238] ? manage_tempaddrs+0x195/0x220 [ 54.891249] ? addrconf_prefix_rcv_add_addr+0x1c0/0x4f0 [ 54.891255] addrconf_prefix_rcv_add_addr+0x1c0/0x4f0 [ 54.891268] addrconf_prefix_rcv+0x2e5/0x9b0 [ 54.891279] ? neigh_update+0x446/0xb90 [ 54.891298] ? ndisc_router_discovery+0x5ab/0xf00 [ 54.891303] ndisc_router_discovery+0x5ab/0xf00 [ 54.891311] ? retint_kernel+0x2d/0x2d [ 54.891331] ndisc_rcv+0x1b6/0x270 [ 54.891340] icmpv6_rcv+0x6aa/0x9f0 [ 54.891345] ? ipv6_chk_mcast_addr+0x176/0x530 [ 54.891351] ? do_csum+0x17b/0x260 [ 54.891360] ip6_input_finish+0x194/0xb20 [ 54.891372] ip6_input+0x5b/0x2c0 [ 54.891380] ? ip6_rcv_finish+0x320/0x320 [ 54.891389] ip6_mc_input+0x15a/0x250 [ 54.891396] ipv6_rcv+0x772/0x1050 [ 54.891403] ? consume_skb+0xbe/0x2d0 [ 54.891412] ? ip6_make_skb+0x2a0/0x2a0 [ 54.891418] ? ip6_input+0x2c0/0x2c0 [ 54.891425] __netif_receive_skb_core+0xa0f/0x1600 [ 54.891436] ? process_backlog+0xac/0x400 [ 54.891441] process_backlog+0xfa/0x400 [ 54.891448] ? net_rx_action+0x145/0x1130 [ 54.891456] net_rx_action+0x310/0x1130 [ 54.891524] ? RTUSBBulkReceive+0x11d/0x190 [mt7610u_sta] [ 54.891538] __do_softirq+0x140/0xaba [ 54.891553] irq_exit+0x10b/0x160 [ 54.891561] do_IRQ+0xbb/0x1b0 Fixes: f3d9832e56c4 ("ipv6: addrconf: cleanup locking in ipv6_add_addr") Signed-off-by: Eric Dumazet Reported-by: Valdis Kletnieks Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 66d8c3d912fd..6233e06fa35c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1267,7 +1267,9 @@ out: in6_ifa_put(ifp); } -static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, + struct inet6_ifaddr *ift, + bool block) { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; @@ -1371,7 +1373,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, true, NULL); + tmp_valid_lft, tmp_prefered_lft, block, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -1956,7 +1958,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) if (ifpub) { in6_ifa_hold(ifpub); spin_unlock_bh(&ifp->lock); - ipv6_create_tempaddr(ifpub, ifp); + ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); } else { spin_unlock_bh(&ifp->lock); @@ -2456,7 +2458,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, * no temporary address currently exists. */ read_unlock_bh(&idev->lock); - ipv6_create_tempaddr(ifp, NULL); + ipv6_create_tempaddr(ifp, NULL, false); } else { read_unlock_bh(&idev->lock); } @@ -4351,7 +4353,7 @@ restart: spin_lock(&ifpub->lock); ifpub->regen_count = 0; spin_unlock(&ifpub->lock); - ipv6_create_tempaddr(ifpub, ifp); + ipv6_create_tempaddr(ifpub, ifp, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); goto restart; -- cgit v1.2.3 From 80661e7687f202514ee5eea0e74916d3c50c2606 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:22 -0800 Subject: ila: cleanup checksum diff Consolidate computing checksum diff into one function. Add get_csum_diff_iaddr that computes the checksum diff between an address argument and locator being written. get_csum_diff calls this using the destination address in the IP header as the argument. Also moved ila_init_saved_csum to be close to the checksum diff functions. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila/ila_common.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index aba0998ddbfb..f1d9248d8b86 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -13,15 +13,28 @@ #include #include "ila.h" -static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +void ila_init_saved_csum(struct ila_params *p) { - struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + if (!p->locator_match.v64) + return; + + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator, + (__be32 *)&p->locator_match); +} +static __wsum get_csum_diff_iaddr(struct ila_addr *iaddr, struct ila_params *p) +{ if (p->locator_match.v64) return p->csum_diff; else - return compute_csum_diff8((__be32 *)&iaddr->loc, - (__be32 *)&p->locator); + return compute_csum_diff8((__be32 *)&p->locator, + (__be32 *)&iaddr->loc); +} + +static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +{ + return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); } static void ila_csum_do_neutral(struct ila_addr *iaddr, @@ -30,13 +43,7 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; - /* Check if checksum adjust value has been cached */ - if (p->locator_match.v64) { - diff = p->csum_diff; - } else { - diff = compute_csum_diff8((__be32 *)&p->locator, - (__be32 *)iaddr); - } + diff = get_csum_diff_iaddr(iaddr, p); fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); @@ -134,16 +141,6 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, iaddr->loc = p->locator; } -void ila_init_saved_csum(struct ila_params *p) -{ - if (!p->locator_match.v64) - return; - - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator, - (__be32 *)&p->locator_match); -} - static int __init ila_init(void) { int ret; -- cgit v1.2.3 From 84287bb3285634b60c55c00a1d5ed843b44fde92 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:23 -0800 Subject: ila: add checksum neutral map auto Add checksum neutral auto that performs checksum neutral mapping without using the C-bit. This is enabled by configuration of a mapping. The checksum neutral function has been split into ila_csum_do_neutral_fmt and ila_csum_do_neutral_nofmt. The former handles the C-bit and includes it in the adjustment value. The latter just sets the adjustment value on the locator diff only. Added configuration for checksum neutral map aut in ila_lwt and ila_xlat. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 1 + net/ipv6/ila/ila_common.c | 65 ++++++++++++++++++++++++++++------------------- net/ipv6/ila/ila_lwt.c | 29 +++++++++++---------- net/ipv6/ila/ila_xlat.c | 10 +++++--- 4 files changed, 61 insertions(+), 44 deletions(-) (limited to 'net/ipv6') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index f54853288f99..0744881dcef3 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -41,6 +41,7 @@ enum { ILA_CSUM_ADJUST_TRANSPORT, ILA_CSUM_NEUTRAL_MAP, ILA_CSUM_NO_ACTION, + ILA_CSUM_NEUTRAL_MAP_AUTO, }; #endif /* _UAPI_LINUX_ILA_H */ diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index f1d9248d8b86..8c88ecf29b93 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -37,8 +37,8 @@ static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); } -static void ila_csum_do_neutral(struct ila_addr *iaddr, - struct ila_params *p) +static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr, + struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; @@ -60,13 +60,23 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, iaddr->ident.csum_neutral ^= 1; } -static void ila_csum_adjust_transport(struct sk_buff *skb, +static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr, struct ila_params *p) { + __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff; - struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + diff = get_csum_diff_iaddr(iaddr, p); + + *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); +} + +static void ila_csum_adjust_transport(struct sk_buff *skb, + struct ila_params *p) +{ size_t nhoff = sizeof(struct ipv6hdr); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum diff; switch (ip6h->nexthdr) { case NEXTHDR_TCP: @@ -105,36 +115,39 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, } break; } - - /* Now change destination address */ - iaddr->loc = p->locator; } void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, - bool set_csum_neutral) + bool sir2ila) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); - /* First deal with the transport checksum */ - if (ila_csum_neutral_set(iaddr->ident)) { - /* C-bit is set in the locator indicating that this - * is a locator being translated to a SIR address. - * Perform (receiver) checksum-neutral translation. - */ - if (!set_csum_neutral) - ila_csum_do_neutral(iaddr, p); - } else { - switch (p->csum_mode) { - case ILA_CSUM_ADJUST_TRANSPORT: - ila_csum_adjust_transport(skb, p); - break; - case ILA_CSUM_NEUTRAL_MAP: - ila_csum_do_neutral(iaddr, p); - break; - case ILA_CSUM_NO_ACTION: + switch (p->csum_mode) { + case ILA_CSUM_ADJUST_TRANSPORT: + ila_csum_adjust_transport(skb, p); + break; + case ILA_CSUM_NEUTRAL_MAP: + if (sir2ila) { + if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) { + /* Checksum flag should never be + * set in a formatted SIR address. + */ + break; + } + } else if (!ila_csum_neutral_set(iaddr->ident)) { + /* ILA to SIR translation and C-bit isn't + * set so we're good. + */ break; } + ila_csum_do_neutral_fmt(iaddr, p); + break; + case ILA_CSUM_NEUTRAL_MAP_AUTO: + ila_csum_do_neutral_nofmt(iaddr, p); + break; + case ILA_CSUM_NO_ACTION: + break; } /* Now change destination address */ diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 696281b4bca2..104af07d83a6 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -127,6 +127,7 @@ static int ila_build_state(struct nlattr *nla, struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; + u8 csum_mode = ILA_CSUM_NO_ACTION; int ret; if (family != AF_INET6) @@ -139,15 +140,6 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } - iaddr = (struct ila_addr *)&cfg6->fc_dst; - - if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) { - /* Don't allow translation for a non-ILA address or checksum - * neutral flag to be set. - */ - return -EINVAL; - } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; @@ -155,6 +147,19 @@ static int ila_build_state(struct nlattr *nla, if (!tb[ILA_ATTR_LOCATOR]) return -EINVAL; + iaddr = (struct ila_addr *)&cfg6->fc_dst; + + if (tb[ILA_ATTR_CSUM_MODE]) + csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); + + if (csum_mode == ILA_CSUM_NEUTRAL_MAP && + ila_csum_neutral_set(iaddr->ident)) { + /* Don't allow translation if checksum neutral bit is + * configured and it's set in the SIR address. + */ + return -EINVAL; + } + newts = lwtunnel_state_alloc(sizeof(*ilwt)); if (!newts) return -ENOMEM; @@ -168,17 +173,13 @@ static int ila_build_state(struct nlattr *nla, p = ila_params_lwtunnel(newts); + p->csum_mode = csum_mode; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we * know both the old locator and the new one. */ p->locator_match = iaddr->loc; - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, (__be32 *)&p->locator); - - if (tb[ILA_ATTR_CSUM_MODE]) - p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); ila_init_saved_csum(p); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 3123b9de91b5..213259629e66 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -138,6 +138,8 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[ILA_ATTR_CSUM_MODE]) xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); + else + xp->ip.csum_mode = ILA_CSUM_NO_ACTION; if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -198,7 +200,7 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral); +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, @@ -396,7 +398,7 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || - nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) + nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) return -1; return 0; @@ -607,7 +609,7 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) +static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -626,7 +628,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - ila_update_ipv6_locator(skb, &ila->xp.ip, set_csum_neutral); + ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); -- cgit v1.2.3 From 70d5aef48a421a68bd9d1bf8f8267af406681580 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:24 -0800 Subject: ila: allow configuration of identifier type Allow identifier to be explicitly configured for a mapping. This can either be one of the identifier types specified in the ILA draft or a value of ILA_ATYPE_USE_FORMAT which means the identifier type is inferred from the identifier type field. If a value other than ILA_ATYPE_USE_FORMAT is set for a mapping then it is assumed that the identifier type field is not present in an identifier. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 13 ++++++++++++ net/ipv6/ila/ila.h | 12 +----------- net/ipv6/ila/ila_lwt.c | 51 +++++++++++++++++++++++++++++++++++++++++------- net/ipv6/ila/ila_xlat.c | 18 ++++++++++++----- 4 files changed, 71 insertions(+), 23 deletions(-) (limited to 'net/ipv6') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 0744881dcef3..8353c78a7781 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -17,6 +17,7 @@ enum { ILA_ATTR_DIR, /* u32 */ ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ + ILA_ATTR_IDENT_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -44,4 +45,16 @@ enum { ILA_CSUM_NEUTRAL_MAP_AUTO, }; +enum { + ILA_ATYPE_IID = 0, + ILA_ATYPE_LUID, + ILA_ATYPE_VIRT_V4, + ILA_ATYPE_VIRT_UNI_V6, + ILA_ATYPE_VIRT_MULTI_V6, + ILA_ATYPE_NONLOCAL_ADDR, + ILA_ATYPE_RSVD_1, + ILA_ATYPE_RSVD_2, + + ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ +}; #endif /* _UAPI_LINUX_ILA_H */ diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index e0170f62bc39..3c7a11b62334 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -55,17 +55,6 @@ struct ila_identifier { }; }; -enum { - ILA_ATYPE_IID = 0, - ILA_ATYPE_LUID, - ILA_ATYPE_VIRT_V4, - ILA_ATYPE_VIRT_UNI_V6, - ILA_ATYPE_VIRT_MULTI_V6, - ILA_ATYPE_RSVD_1, - ILA_ATYPE_RSVD_2, - ILA_ATYPE_RSVD_3, -}; - #define CSUM_NEUTRAL_FLAG htonl(0x10000000) struct ila_addr { @@ -93,6 +82,7 @@ struct ila_params { struct ila_locator locator_match; __wsum csum_diff; u8 csum_mode; + u8 ident_type; }; static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 104af07d83a6..4b97d573f223 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -114,6 +114,7 @@ drop: static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct nlattr *nla, @@ -127,19 +128,14 @@ static int ila_build_state(struct nlattr *nla, struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; + u8 ident_type = ILA_ATYPE_USE_FORMAT; u8 csum_mode = ILA_CSUM_NO_ACTION; + u8 eff_ident_type; int ret; if (family != AF_INET6) return -EINVAL; - if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { - /* Need to have full locator and at least type field - * included in destination - */ - return -EINVAL; - } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; @@ -149,6 +145,41 @@ static int ila_build_state(struct nlattr *nla, iaddr = (struct ila_addr *)&cfg6->fc_dst; + if (tb[ILA_ATTR_IDENT_TYPE]) + ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]); + + if (ident_type == ILA_ATYPE_USE_FORMAT) { + /* Infer identifier type from type field in formatted + * identifier. + */ + + if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { + /* Need to have full locator and at least type field + * included in destination + */ + return -EINVAL; + } + + eff_ident_type = iaddr->ident.type; + } else { + eff_ident_type = ident_type; + } + + switch (eff_ident_type) { + case ILA_ATYPE_IID: + /* Don't allow ILA for IID type */ + return -EINVAL; + case ILA_ATYPE_LUID: + break; + case ILA_ATYPE_VIRT_V4: + case ILA_ATYPE_VIRT_UNI_V6: + case ILA_ATYPE_VIRT_MULTI_V6: + case ILA_ATYPE_NONLOCAL_ADDR: + /* These ILA formats are not supported yet. */ + default: + return -EINVAL; + } + if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); @@ -174,6 +205,7 @@ static int ila_build_state(struct nlattr *nla, p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; + p->ident_type = ident_type; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we @@ -208,9 +240,13 @@ static int ila_fill_encap_info(struct sk_buff *skb, if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -221,6 +257,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ 0; } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 213259629e66..6eb5e68f112a 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -121,6 +121,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, @@ -141,6 +142,12 @@ static int parse_nl_config(struct genl_info *info, else xp->ip.csum_mode = ILA_CSUM_NO_ACTION; + if (info->attrs[ILA_ATTR_IDENT_TYPE]) + xp->ip.ident_type = nla_get_u8( + info->attrs[ILA_ATTR_IDENT_TYPE]); + else + xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; + if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); @@ -398,7 +405,8 @@ static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || - nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) + nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || + nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; @@ -619,10 +627,10 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) /* Assumes skb contains a valid IPv6 header that is pulled */ - if (!ila_addr_is_ila(iaddr)) { - /* Type indicates this is not an ILA address */ - return 0; - } + /* No check here that ILA type in the mapping matches what is in the + * address. We assume that whatever sender gaves us can be translated. + * The checksum mode however is relevant. + */ rcu_read_lock(); -- cgit v1.2.3 From fddb231ebe647749782a9ebf11106a81f7168ba7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 5 Nov 2017 15:58:25 -0800 Subject: ila: Add a hook type for LWT routes In LWT tunnels both an input and output route method is defined. If both of these are executed in the same path then double translation happens and the effect is not correct. This patch adds a new attribute that indicates the hook type. Two values are defined for route output and route output. ILA translation is only done for the one that is set. The default is to enable ILA on route output. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 7 +++++++ net/ipv6/ila/ila_lwt.c | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 8353c78a7781..483b77af4eb8 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -18,6 +18,7 @@ enum { ILA_ATTR_PAD, ILA_ATTR_CSUM_MODE, /* u8 */ ILA_ATTR_IDENT_TYPE, /* u8 */ + ILA_ATTR_HOOK_TYPE, /* u8 */ __ILA_ATTR_MAX, }; @@ -57,4 +58,10 @@ enum { ILA_ATYPE_USE_FORMAT = 32, /* Get type from type field in identifier */ }; + +enum { + ILA_HOOK_ROUTE_OUTPUT, + ILA_HOOK_ROUTE_INPUT, +}; + #endif /* _UAPI_LINUX_ILA_H */ diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 4b97d573f223..3d56a2fb6f86 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -20,6 +20,7 @@ struct ila_lwt { struct ila_params p; struct dst_cache dst_cache; u32 connected : 1; + u32 lwt_output : 1; }; static inline struct ila_lwt *ila_lwt_lwtunnel( @@ -45,8 +46,10 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate), - true); + if (ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(orig_dst->lwtstate), + true); if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) { /* Already have a next hop address in route, no need for @@ -98,11 +101,15 @@ drop: static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate); if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); + if (!ilwt->lwt_output) + ila_update_ipv6_locator(skb, + ila_params_lwtunnel(dst->lwtstate), + false); return dst->lwtstate->orig_input(skb); @@ -115,6 +122,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, + [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct nlattr *nla, @@ -129,7 +137,9 @@ static int ila_build_state(struct nlattr *nla, const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; u8 ident_type = ILA_ATYPE_USE_FORMAT; + u8 hook_type = ILA_HOOK_ROUTE_OUTPUT; u8 csum_mode = ILA_CSUM_NO_ACTION; + bool lwt_output = true; u8 eff_ident_type; int ret; @@ -180,6 +190,20 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } + if (tb[ILA_ATTR_HOOK_TYPE]) + hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]); + + switch (hook_type) { + case ILA_HOOK_ROUTE_OUTPUT: + lwt_output = true; + break; + case ILA_HOOK_ROUTE_INPUT: + lwt_output = false; + break; + default: + return -EINVAL; + } + if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); @@ -202,6 +226,8 @@ static int ila_build_state(struct nlattr *nla, return ret; } + ilwt->lwt_output = !!lwt_output; + p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; @@ -236,6 +262,7 @@ static int ila_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ila_params *p = ila_params_lwtunnel(lwtstate); + struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate); if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) @@ -247,6 +274,11 @@ static int ila_fill_encap_info(struct sk_buff *skb, if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, + ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT : + ILA_HOOK_ROUTE_INPUT)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -258,6 +290,7 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */ 0; } -- cgit v1.2.3 From 356d1833b638bd465672aefeb71def3ab93fc17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:28 -0800 Subject: tcp: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem Note that when a new netns is created, it inherits its sysctl_tcp_rmem and sysctl_tcp_wmem from initial netns. This change is needed so that we can refine TCP rcvbuf autotuning, to take RTT into consideration. Signed-off-by: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 -- net/ipv4/sysctl_net_ipv4.c | 32 ++++++++++++++++---------------- net/ipv4/tcp.c | 21 ++++++++------------- net/ipv4/tcp_input.c | 14 ++++++++------ net/ipv4/tcp_ipv4.c | 13 ++++++++++--- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 8 files changed, 47 insertions(+), 43 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 379550f8124a..5e12975fc658 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -155,6 +155,8 @@ struct netns_ipv4 { int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; + int sysctl_tcp_wmem[3]; + int sysctl_tcp_rmem[3]; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index babfd4da1515..2f2c69ad31b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; -extern int sysctl_tcp_wmem[3]; -extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a82b44038308..ef0ff3357a44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -440,22 +440,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { - .procname = "tcp_wmem", - .data = &sysctl_tcp_wmem, - .maxlen = sizeof(sysctl_tcp_wmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_rmem", - .data = &sysctl_tcp_rmem, - .maxlen = sizeof(sysctl_tcp_rmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, @@ -1164,6 +1148,22 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &thousand, }, + { + .procname = "tcp_wmem", + .data = &init_net.ipv4.sysctl_tcp_wmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_rmem", + .data = &init_net.ipv4.sysctl_tcp_rmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4cb19ed4628..bc71a27d5ad9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; - EXPORT_SYMBOL(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); @@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); } @@ -3636,13 +3631,13 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; + init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[1] = 87380; + init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b54ee09cbcf7..9ceaa1fdc3ab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; - int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1; + int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); + sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. It is made immediately after connection enters @@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); icsk->icsk_ack.quick = 0; - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + net->ipv4.sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk) while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + rcvbuf = min(rcvwin / tp->advmss * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0162c577bb9c..1eac84b8044e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2409,8 +2409,8 @@ struct proto tcp_prot = { .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2509,7 +2509,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; - + if (net != &init_net) { + memcpy(net->ipv4.sysctl_tcp_rmem, + init_net.ipv4.sysctl_tcp_rmem, + sizeof(init_net.ipv4.sysctl_tcp_rmem)); + memcpy(net->ipv4.sysctl_tcp_wmem, + init_net.ipv4.sysctl_tcp_wmem, + sizeof(init_net.ipv4.sysctl_tcp_wmem)); + } net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a9d917e4dad5..9b98d35aa0d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e2529958b52..6bb98c93edfe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1940,8 +1940,8 @@ struct proto tcpv6_prot = { .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, -- cgit v1.2.3 From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Tue, 7 Nov 2017 21:52:09 -0800 Subject: net: ipv6: sysctl to specify IPv6 ND traffic class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti Signed-off-by: Erik Kline Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 11 +++++++++++ net/ipv6/ndisc.c | 9 ++++++++- 5 files changed, 30 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 54410a1d4065..d8676dda7fa6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1732,6 +1732,15 @@ ndisc_notify - BOOLEAN 1 - Generate unsolicited neighbour advertisements when device is brought up or hardware address changes. +ndisc_tclass - INTEGER + The IPv6 Traffic Class to use by default when sending IPv6 Neighbor + Discovery (Router Solicitation, Router Advertisement, Neighbor + Solicitation, Neighbor Advertisement, Redirect) messages. + These 8 bits can be interpreted as 6 high order bits holding the DSCP + value and 2 low order bits representing ECN (which you probably want + to leave cleared). + 0 - (default) + mldv1_unsolicited_report_interval - INTEGER The interval in milliseconds in which the next unsolicited MLDv1 report retransmit will take place. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ea04ca024f0d..cb18c6290ca8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -73,6 +73,7 @@ struct ipv6_devconf { __u32 enhanced_dad; __u32 addr_gen_mode; __s32 disable_policy; + __s32 ndisc_tclass; struct ctl_table_header *sysctl_header; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index b22a9c4e1b12..9c0f4a92bcff 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -186,6 +186,7 @@ enum { DEVCONF_ADDR_GEN_MODE, DEVCONF_DISABLE_POLICY, DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, + DEVCONF_NDISC_TCLASS, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6233e06fa35c..a6dffd65eb9d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5059,6 +5059,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; + array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; } static inline size_t inet6_ifla6_size(void) @@ -5986,6 +5987,7 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, } static int minus_one = -1; +static const int zero = 0; static const int one = 1; static const int two_five_five = 255; @@ -6356,6 +6358,15 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = addrconf_sysctl_disable_policy, }, + { + .procname = "ndisc_tclass", + .data = &ipv6_devconf.ndisc_tclass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&zero, + .extra2 = (void *)&two_five_five, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f9c3ffe04382..b3cea200c85e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -427,12 +427,19 @@ static void ip6_nd_hdr(struct sk_buff *skb, int hop_limit, int len) { struct ipv6hdr *hdr; + struct inet6_dev *idev; + unsigned tclass; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + tclass = idev ? idev->cnf.ndisc_tclass : 0; + rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, 0, 0); + ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; -- cgit v1.2.3 From 39b175211053c7a6a4d794c42e225994f1c069c2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 10 Nov 2017 14:03:51 -0800 Subject: net: Remove unused skb_shared_info member ip6_frag_id was only used by UFO, which has been removed. ipv6_proxy_select_ident() only existed to set ip6_frag_id and has no in-tree callers. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 - include/net/ipv6.h | 1 - net/ipv6/output_core.c | 31 ------------------------------- 3 files changed, 33 deletions(-) (limited to 'net/ipv6') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 57d712671081..54fe91183a8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -500,7 +500,6 @@ struct skb_shared_info { struct skb_shared_hwtstamps hwtstamps; unsigned int gso_type; u32 tskey; - __be32 ip6_frag_id; /* * Warning : all fields before dataref are cleared in __alloc_skb() diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fb6d67012de6..ec14f0d5a3a1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,7 +767,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index a338bbc33cf3..4a7e5ffa5108 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,37 +31,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } -/* This function exists only for tap drivers that must support broken - * clients requesting UFO without specifying an IPv6 fragment ID. - * - * This is similar to ipv6_select_ident() but we use an independent hash - * seed to limit information leakage. - * - * The network header must be set before calling this. - */ -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) -{ - static u32 ip6_proxy_idents_hashrnd __read_mostly; - struct in6_addr buf[2]; - struct in6_addr *addrs; - u32 id; - - addrs = skb_header_pointer(skb, - skb_network_offset(skb) + - offsetof(struct ipv6hdr, saddr), - sizeof(buf), buf); - if (!addrs) - return; - - net_get_random_once(&ip6_proxy_idents_hashrnd, - sizeof(ip6_proxy_idents_hashrnd)); - - id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, - &addrs[1], &addrs[0]); - skb_shinfo(skb)->ip6_frag_id = htonl(id); -} -EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); - __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) -- cgit v1.2.3 From 338d182fa542b3ca05456ad1ce9cebe6580083b1 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Wed, 8 Nov 2017 11:23:46 -0800 Subject: ipv6: try not to take rtnl_lock in ip6mr_sk_done Avoid traversing the list of mr6_tables (which requires the rtnl_lock) in ip6mr_sk_done(), when we know in advance that a match will not be found. This can happen when rawv6_close()/ip6mr_sk_done() is invoked on non-mroute6 sockets. This patch helps reduce rtnl_lock contention when destroying a large number of net namespaces, each having a non-mroute6 raw socket. v2: same patch, only fixed subject line and expanded comment. Signed-off-by: Francesco Ruggeri Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 59fad81e5f7a..9c24b85949c1 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1617,6 +1617,10 @@ int ip6mr_sk_done(struct sock *sk) struct net *net = sock_net(sk); struct mr6_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return err; + rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == mrt->mroute6_sk) { -- cgit v1.2.3 From 929fc0327569aa745c9c3cb68a213c22fad3f3f9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:49 +0800 Subject: ip6_gre: add the process for redirect in ip6gre_err This patch is to add redirect icmp packet process for ip6gre by calling ip6_redirect() in ip6gre_err(), as in vti6_err. Prior to this patch, there's even no route cache generated after receiving redirect. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3e10c51e7e0c..0684d0ccaaa5 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -369,6 +369,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { + struct net *net = dev_net(skb->dev); const struct gre_base_hdr *greh; const struct ipv6hdr *ipv6h; int grehlen = sizeof(*greh); @@ -442,6 +443,10 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; return; + case NDISC_REDIRECT: + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + return; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) -- cgit v1.2.3 From fe1a4ca0a2b7884672661284666a0b8c183b0b1e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:50 +0800 Subject: ip6_gre: process toobig in a better way Now ip6gre processes toobig icmp packet by setting gre dev's mtu in ip6gre_err, which would cause few things not good: - It couldn't set mtu with dev_set_mtu due to it's not in user context, which causes route cache and idev->cnf.mtu6 not to be updated. - It has to update sk dst pmtu in tx path according to gredev->mtu for ip6gre, while it updates pmtu again according to lower dst pmtu in ip6_tnl_xmit. - To change dev->mtu by toobig icmp packet is not a good idea, it should only work on pmtu. This patch is to process toobig by updating the lower dst's pmtu, as later sk dst pmtu will be updated in ip6_tnl_xmit, the same way as in ip4gre. Note that gre dev's mtu will not be updated any more, it doesn't make any sense to change dev's mtu after receiving a toobig packet. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 0684d0ccaaa5..b90bad7a4e56 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -403,9 +403,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; switch (type) { - __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu; + __u32 teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -436,12 +435,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } return; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset - t->tun_hlen; - if (t->dev->type == ARPHRD_ETHER) - mtu -= ETH_HLEN; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - t->dev->mtu = mtu; + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); return; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, @@ -508,7 +502,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); - struct dst_entry *dst = skb_dst(skb); __be16 protocol; if (dev->type == ARPHRD_ETHER) @@ -527,10 +520,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - /* TooBig packet may have updated dst->dev's mtu */ - if (dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); - return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } -- cgit v1.2.3 From 383c1f88759bba7f387b7b705c31081e5df40b38 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:51 +0800 Subject: ip6_tunnel: add the process for redirect in ip6_tnl_err The same process for redirect in "ip6_gre: add the process for redirect in ip6gre_err" is needed by ip4ip6 and ip6ip6 as well. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 439d65f7e094..a1f704c799d9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -471,15 +471,16 @@ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { - const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data; - struct ip6_tnl *t; - int rel_msg = 0; + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; - u8 tproto; __u32 rel_info = 0; - __u16 len; + struct ip6_tnl *t; int err = -ENOENT; + int rel_msg = 0; + u8 tproto; + __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further @@ -543,6 +544,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rel_msg = 1; } break; + case NDISC_REDIRECT: + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + break; } *type = rel_type; -- cgit v1.2.3 From b00f543240b9423eda73ad06c57becdd5478bc26 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:52 +0800 Subject: ip6_tunnel: process toobig in a better way The same improvement in "ip6_gre: process toobig in a better way" is needed by ip4ip6 and ip6ip6 as well. Note that ip4ip6 and ip6ip6 will also update sk dst pmtu in their err_handlers. Like I said before, gre6 could not do this as it's inner proto is not certain. But for all of them, sk dst pmtu will be updated in tx path if in need. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a1f704c799d9..7e9e205b69d9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -498,9 +498,8 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, err = 0; switch (*type) { - __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu; + __u32 mtu, teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -531,11 +530,11 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: + ip6_update_pmtu(skb, net, htonl(*info), 0, 0, + sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - t->dev->mtu = mtu; - len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; -- cgit v1.2.3 From 77552cfa39c48e695c39d0553afc8c6018e411ce Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:53 +0800 Subject: ip6_tunnel: clean up ip4ip6 and ip6ip6's err_handlers This patch is to remove some useless codes of redirect and fix some indents on ip4ip6 and ip6ip6's err_handlers. Note that redirect icmp packet is already processed in ip6_tnl_err, the old redirect codes in ip4ip6_err actually never worked even before this patch. Besides, there's no need to send redirect to user's sk, it's for lower dst, so just remove it in this patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 7e9e205b69d9..00882fdb1223 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -563,13 +563,12 @@ static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - int rel_msg = 0; - u8 rel_type = type; - u8 rel_code = code; __u32 rel_info = ntohl(info); - int err; - struct sk_buff *skb2; const struct iphdr *eiph; + struct sk_buff *skb2; + int err, rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; @@ -594,10 +593,6 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; - case NDISC_REDIRECT: - rel_type = ICMP_REDIRECT; - rel_code = ICMP_REDIR_HOST; - /* fall through */ default: return 0; } @@ -616,33 +611,26 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, eiph = ip_hdr(skb2); /* Try to guess incoming interface */ - rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, - eiph->saddr, 0, - 0, 0, - IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, + 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; + ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { - ip_rt_put(rt); - rt = NULL; rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, - eiph->daddr, eiph->saddr, - 0, 0, - IPPROTO_IPIP, - RT_TOS(eiph->tos), 0); - if (IS_ERR(rt) || - rt->dst.dev->type != ARPHRD_TUNNEL) { + eiph->daddr, eiph->saddr, 0, 0, + IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { - ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) @@ -654,10 +642,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info); + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, + rel_info); } - if (rel_type == ICMP_REDIRECT) - skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -670,11 +657,10 @@ static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - int rel_msg = 0; + __u32 rel_info = ntohl(info); + int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; - __u32 rel_info = ntohl(info); - int err; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); -- cgit v1.2.3 From baeb0dbbb56598a2ccd98b56e0da3e9d22869112 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sun, 12 Nov 2017 22:34:03 +0300 Subject: xfrm6_tunnel: exit_net cleanup check added Be sure that spi_byaddr and spi_byspi arrays initialized in net_init hook were return to initial state Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/ipv6/xfrm6_tunnel.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 4e438bc7ee87..f85f0d7480ac 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -338,6 +338,14 @@ static int __net_init xfrm6_tunnel_net_init(struct net *net) static void __net_exit xfrm6_tunnel_net_exit(struct net *net) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byspi[i])); } static struct pernet_operations xfrm6_tunnel_net_ops = { -- cgit v1.2.3 From 094009531612246d9e13f9e0c3ae2205d7f63a0a Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 14 Nov 2017 14:21:32 +0100 Subject: ipv6: set all.accept_dad to 0 by default With commits 35e015e1f577 and a2d3f3e33853, the global 'accept_dad' flag is also taken into account (default value is 1). If either global or per-interface flag is non-zero, DAD will be enabled on a given interface. This is not backward compatible: before those patches, the user could disable DAD just by setting the per-interface flag to 0. Now, the user instead needs to set both flags to 0 to actually disable DAD. Restore the previous behaviour by setting the default for the global 'accept_dad' flag to 0. This way, DAD is still enabled by default, as per-interface flags are set to 1 on device creation, but setting them to 0 is enough to disable DAD on a given interface. - Before 35e015e1f57a7 and a2d3f3e33853: global per-interface DAD enabled [default] 1 1 yes X 0 no X 1 yes - After 35e015e1f577 and a2d3f3e33853: global per-interface DAD enabled [default] 1 1 yes 0 0 no 0 1 yes 1 0 yes - After this fix: global per-interface DAD enabled 1 1 yes 0 0 no [default] 0 1 yes 1 0 yes Fixes: 35e015e1f577 ("ipv6: fix net.ipv6.conf.all interface DAD handlers") Fixes: a2d3f3e33853 ("ipv6: fix net.ipv6.conf.all.accept_dad behaviour for real") CC: Stefano Brivio CC: Matteo Croce CC: Erik Kline Signed-off-by: Nicolas Dichtel Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a6dffd65eb9d..a0ae1c9d37df 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -231,7 +231,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, - .accept_dad = 1, + .accept_dad = 0, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, .stable_secret = { -- cgit v1.2.3 From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make default TCP default congestion control to a per namespace value. This changes default congestion control to a pointer to congestion ops (rather than implicit as first element of available lsit). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 6 ++-- net/ipv4/fib_semantics.c | 4 +-- net/ipv4/sysctl_net_ipv4.c | 19 +++++++----- net/ipv4/tcp_cong.c | 76 ++++++++++++++++++++++------------------------ net/ipv4/tcp_ipv4.c | 9 ++++++ net/ipv6/route.c | 3 +- 7 files changed, 64 insertions(+), 54 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5e12975fc658..44668c29701a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; unsigned int sysctl_tcp_fastopen_blackhole_timeout; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed71511e67a6..35cc7d0d3d47 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); -int tcp_set_default_congestion_control(const char *name); -void tcp_get_default_congestion_control(char *name); +int tcp_set_default_congestion_control(struct net *net, const char *name); +void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); @@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 589caaa90613..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { val = nla_get_u32(nla); } @@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef0ff3357a44..93e172118a94 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -447,12 +449,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -763,6 +759,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one }, #endif + { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..bc6c02f16243 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + const struct tcp_congestion_ops *prev; + int ret; - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eac84b8044e..c6bc0c4d19c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 70d9659fc1e9..05eb7bc36156 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2378,6 +2378,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { -- cgit v1.2.3