summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/net/dst.h2
-rw-r--r--include/net/ip6_fib.h79
-rw-r--r--include/net/ip6_route.h5
-rw-r--r--net/ipv6/addrconf.c17
-rw-r--r--net/ipv6/ip6_fib.c645
-rw-r--r--net/ipv6/route.c901
6 files changed, 1179 insertions, 470 deletions
diff --git a/include/net/dst.h b/include/net/dst.h
index 06a6765da074..204c19e25456 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -101,7 +101,7 @@ struct dst_entry {
union {
struct dst_entry *next;
struct rtable __rcu *rt_next;
- struct rt6_info *rt6_next;
+ struct rt6_info __rcu *rt6_next;
struct dn_route __rcu *dn_next;
};
};
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index d060d711a624..10c913816032 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -29,6 +29,14 @@
#define FIB6_TABLE_HASHSZ 1
#endif
+#define RT6_DEBUG 2
+
+#if RT6_DEBUG >= 3
+#define RT6_TRACE(x...) pr_debug(x)
+#else
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
+
struct rt6_info;
struct fib6_config {
@@ -60,25 +68,30 @@ struct fib6_config {
};
struct fib6_node {
- struct fib6_node *parent;
- struct fib6_node *left;
- struct fib6_node *right;
+ struct fib6_node __rcu *parent;
+ struct fib6_node __rcu *left;
+ struct fib6_node __rcu *right;
#ifdef CONFIG_IPV6_SUBTREES
- struct fib6_node *subtree;
+ struct fib6_node __rcu *subtree;
#endif
- struct rt6_info *leaf;
+ struct rt6_info __rcu *leaf;
__u16 fn_bit; /* bit key */
__u16 fn_flags;
int fn_sernum;
- struct rt6_info *rr_ptr;
+ struct rt6_info __rcu *rr_ptr;
struct rcu_head rcu;
};
+struct fib6_gc_args {
+ int timeout;
+ int more;
+};
+
#ifndef CONFIG_IPV6_SUBTREES
#define FIB6_SUBTREE(fn) NULL
#else
-#define FIB6_SUBTREE(fn) ((fn)->subtree)
+#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1))
#endif
struct mx6_config {
@@ -98,6 +111,22 @@ struct rt6key {
struct fib6_table;
+struct rt6_exception_bucket {
+ struct hlist_head chain;
+ int depth;
+};
+
+struct rt6_exception {
+ struct hlist_node hlist;
+ struct rt6_info *rt6i;
+ unsigned long stamp;
+ struct rcu_head rcu;
+};
+
+#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
+#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
+#define FIB6_MAX_DEPTH 5
+
struct rt6_info {
struct dst_entry dst;
@@ -134,14 +163,25 @@ struct rt6_info {
struct inet6_dev *rt6i_idev;
struct rt6_info * __percpu *rt6i_pcpu;
+ struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
u32 rt6i_metric;
u32 rt6i_pmtu;
/* more non-fragment space at head required */
unsigned short rt6i_nfheader_len;
u8 rt6i_protocol;
+ u8 exception_bucket_flushed:1,
+ unused:7;
};
+#define for_each_fib6_node_rt_rcu(fn) \
+ for (rt = rcu_dereference((fn)->leaf); rt; \
+ rt = rcu_dereference(rt->dst.rt6_next))
+
+#define for_each_fib6_walker_rt(w) \
+ for (rt = (w)->leaf; rt; \
+ rt = rcu_dereference_protected(rt->dst.rt6_next, 1))
+
static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
{
return ((struct rt6_info *)dst)->rt6i_idev;
@@ -188,6 +228,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
if (fn) {
*cookie = fn->fn_sernum;
+ /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */
+ smp_rmb();
status = true;
}
@@ -248,7 +290,6 @@ struct fib6_walker {
struct fib6_node *root, *node;
struct rt6_info *leaf;
enum fib6_walk_state state;
- bool prune;
unsigned int skip;
unsigned int count;
int (*func)(struct fib6_walker *);
@@ -256,12 +297,15 @@ struct fib6_walker {
};
struct rt6_statistics {
- __u32 fib_nodes;
- __u32 fib_route_nodes;
- __u32 fib_rt_alloc; /* permanent routes */
- __u32 fib_rt_entries; /* rt entries in table */
- __u32 fib_rt_cache; /* cache routes */
- __u32 fib_discarded_routes;
+ __u32 fib_nodes; /* all fib6 nodes */
+ __u32 fib_route_nodes; /* intermediate nodes */
+ __u32 fib_rt_entries; /* rt entries in fib table */
+ __u32 fib_rt_cache; /* cached rt entries in exception table */
+ __u32 fib_discarded_routes; /* total number of routes delete */
+
+ /* The following stats are not protected by any lock */
+ atomic_t fib_rt_alloc; /* total number of routes alloced */
+ atomic_t fib_rt_uncache; /* rt entries in uncached list */
};
#define RTN_TL_ROOT 0x0001
@@ -277,7 +321,7 @@ struct rt6_statistics {
struct fib6_table {
struct hlist_node tb6_hlist;
u32 tb6_id;
- rwlock_t tb6_lock;
+ spinlock_t tb6_lock;
struct fib6_node tb6_root;
struct inet_peer_base tb6_peers;
unsigned int flags;
@@ -325,7 +369,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root,
struct fib6_node *fib6_locate(struct fib6_node *root,
const struct in6_addr *daddr, int dst_len,
- const struct in6_addr *saddr, int src_len);
+ const struct in6_addr *saddr, int src_len,
+ bool exact_match);
void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
void *arg);
@@ -358,6 +403,8 @@ void __net_exit fib6_notifier_exit(struct net *net);
unsigned int fib6_tables_seq_read(struct net *net);
int fib6_tables_dump(struct net *net, struct notifier_block *nb);
+void fib6_update_sernum(struct rt6_info *rt);
+
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
int fib6_rules_init(void);
void fib6_rules_cleanup(void);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index ee96f402cb75..a0087fb9864b 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -95,6 +95,11 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack);
int ip6_ins_rt(struct rt6_info *);
int ip6_del_rt(struct rt6_info *);
+void rt6_flush_exceptions(struct rt6_info *rt);
+int rt6_remove_exception_rt(struct rt6_info *rt);
+void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
+ unsigned long now);
+
static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
const struct in6_addr *daddr,
unsigned int prefs,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 837418ff2d4b..9854d93e45bb 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2321,24 +2321,24 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
if (!table)
return NULL;
- read_lock_bh(&table->tb6_lock);
- fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0);
+ rcu_read_lock();
+ fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
if (!fn)
goto out;
- noflags |= RTF_CACHE;
- for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ for_each_fib6_node_rt_rcu(fn) {
if (rt->dst.dev->ifindex != dev->ifindex)
continue;
if ((rt->rt6i_flags & flags) != flags)
continue;
if ((rt->rt6i_flags & noflags) != 0)
continue;
- dst_hold(&rt->dst);
+ if (!dst_hold_safe(&rt->dst))
+ rt = NULL;
break;
}
out:
- read_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
return rt;
}
@@ -5898,10 +5898,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
spin_lock(&ifa->lock);
if (ifa->rt) {
struct rt6_info *rt = ifa->rt;
- struct fib6_table *table = rt->rt6i_table;
int cpu;
- read_lock(&table->tb6_lock);
+ rcu_read_lock();
addrconf_set_nopolicy(ifa->rt, val);
if (rt->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
@@ -5911,7 +5910,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
addrconf_set_nopolicy(*rtp, val);
}
}
- read_unlock(&table->tb6_lock);
+ rcu_read_unlock();
}
spin_unlock(&ifa->lock);
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e5308d7cbd75..52a29ba32928 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -38,14 +38,6 @@
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
-#define RT6_DEBUG 2
-
-#if RT6_DEBUG >= 3
-#define RT6_TRACE(x...) pr_debug(x)
-#else
-#define RT6_TRACE(x...) do { ; } while (0)
-#endif
-
static struct kmem_cache *fib6_node_kmem __read_mostly;
struct fib6_cleaner {
@@ -62,9 +54,12 @@ struct fib6_cleaner {
#define FWS_INIT FWS_L
#endif
-static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
-static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
-static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
+static struct rt6_info *fib6_find_prefix(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *fn);
+static struct fib6_node *fib6_repair_tree(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *fn);
static int fib6_walk(struct net *net, struct fib6_walker *w);
static int fib6_walk_continue(struct fib6_walker *w);
@@ -110,6 +105,20 @@ enum {
FIB6_NO_SERNUM_CHANGE = 0,
};
+void fib6_update_sernum(struct rt6_info *rt)
+{
+ struct fib6_table *table = rt->rt6i_table;
+ struct net *net = dev_net(rt->dst.dev);
+ struct fib6_node *fn;
+
+ spin_lock_bh(&table->tb6_lock);
+ fn = rcu_dereference_protected(rt->rt6i_node,
+ lockdep_is_held(&table->tb6_lock));
+ if (fn)
+ fn->fn_sernum = fib6_new_sernum(net);
+ spin_unlock_bh(&table->tb6_lock);
+}
+
/*
* Auxiliary address test functions for the radix tree.
*
@@ -140,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
-static struct fib6_node *node_alloc(void)
+static struct fib6_node *node_alloc(struct net *net)
{
struct fib6_node *fn;
fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
+ if (fn)
+ net->ipv6.rt6_stats->fib_nodes++;
return fn;
}
-static void node_free_immediate(struct fib6_node *fn)
+static void node_free_immediate(struct net *net, struct fib6_node *fn)
{
kmem_cache_free(fib6_node_kmem, fn);
+ net->ipv6.rt6_stats->fib_nodes--;
}
static void node_free_rcu(struct rcu_head *head)
@@ -161,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head)
kmem_cache_free(fib6_node_kmem, fn);
}
-static void node_free(struct fib6_node *fn)
+static void node_free(struct net *net, struct fib6_node *fn)
{
call_rcu(&fn->rcu, node_free_rcu);
+ net->ipv6.rt6_stats->fib_nodes--;
}
void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
@@ -185,9 +198,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
*ppcpu_rt = NULL;
}
}
-
- free_percpu(non_pcpu_rt->rt6i_pcpu);
- non_pcpu_rt->rt6i_pcpu = NULL;
}
EXPORT_SYMBOL_GPL(rt6_free_pcpu);
@@ -205,8 +215,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb)
* Initialize table lock at a single place to give lockdep a key,
* tables aren't visible prior to being linked to the list.
*/
- rwlock_init(&tb->tb6_lock);
-
+ spin_lock_init(&tb->tb6_lock);
h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
/*
@@ -225,7 +234,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
table = kzalloc(sizeof(*table), GFP_ATOMIC);
if (table) {
table->tb6_id = id;
- table->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ rcu_assign_pointer(table->tb6_root.leaf,
+ net->ipv6.ip6_null_entry);
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&table->tb6_peers);
}
@@ -322,11 +332,8 @@ unsigned int fib6_tables_seq_read(struct net *net)
struct hlist_head *head = &net->ipv6.fib_table_hash[h];
struct fib6_table *tb;
- hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
- read_lock_bh(&tb->tb6_lock);
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist)
fib_seq += tb->fib_seq;
- read_unlock_bh(&tb->tb6_lock);
- }
}
rcu_read_unlock();
@@ -372,7 +379,7 @@ static int fib6_node_dump(struct fib6_walker *w)
{
struct rt6_info *rt;
- for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
+ for_each_fib6_walker_rt(w)
fib6_rt_dump(rt, w->args);
w->leaf = NULL;
return 0;
@@ -382,9 +389,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb,
struct fib6_walker *w)
{
w->root = &tb->tb6_root;
- read_lock_bh(&tb->tb6_lock);
+ spin_lock_bh(&tb->tb6_lock);
fib6_walk(net, w);
- read_unlock_bh(&tb->tb6_lock);
+ spin_unlock_bh(&tb->tb6_lock);
}
/* Called with rcu_read_lock() */
@@ -421,7 +428,7 @@ static int fib6_dump_node(struct fib6_walker *w)
int res;
struct rt6_info *rt;
- for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+ for_each_fib6_walker_rt(w) {
res = rt6_dump_route(rt, w->args);
if (res < 0) {
/* Frame is full, suspend walking */
@@ -480,9 +487,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
w->count = 0;
w->skip = 0;
- read_lock_bh(&table->tb6_lock);
+ spin_lock_bh(&table->tb6_lock);
res = fib6_walk(net, w);
- read_unlock_bh(&table->tb6_lock);
+ spin_unlock_bh(&table->tb6_lock);
if (res > 0) {
cb->args[4] = 1;
cb->args[5] = w->root->fn_sernum;
@@ -497,9 +504,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
} else
w->skip = 0;
- read_lock_bh(&table->tb6_lock);
+ spin_lock_bh(&table->tb6_lock);
res = fib6_walk_continue(w);
- read_unlock_bh(&table->tb6_lock);
+ spin_unlock_bh(&table->tb6_lock);
if (res <= 0) {
fib6_walker_unlink(net, w);
cb->args[4] = 0;
@@ -580,11 +587,13 @@ out:
* node.
*/
-static struct fib6_node *fib6_add_1(struct fib6_node *root,
- struct in6_addr *addr, int plen,
- int offset, int allow_create,
- int replace_required, int sernum,
- struct netlink_ext_ack *extack)
+static struct fib6_node *fib6_add_1(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *root,
+ struct in6_addr *addr, int plen,
+ int offset, int allow_create,
+ int replace_required,
+ struct netlink_ext_ack *extack)
{
struct fib6_node *fn, *in, *ln;
struct fib6_node *pn = NULL;
@@ -599,7 +608,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
fn = root;
do {
- key = (struct rt6key *)((u8 *)fn->leaf + offset);
+ struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ key = (struct rt6key *)((u8 *)leaf + offset);
/*
* Prefix match
@@ -625,12 +636,10 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
if (plen == fn->fn_bit) {
/* clean up an intermediate node */
if (!(fn->fn_flags & RTN_RTINFO)) {
- rt6_release(fn->leaf);
- fn->leaf = NULL;
+ RCU_INIT_POINTER(fn->leaf, NULL);
+ rt6_release(leaf);
}
- fn->fn_sernum = sernum;
-
return fn;
}
@@ -639,10 +648,13 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
*/
/* Try to walk down on tree. */
- fn->fn_sernum = sernum;
dir = addr_bit_set(addr, fn->fn_bit);
pn = fn;
- fn = dir ? fn->right : fn->left;
+ fn = dir ?
+ rcu_dereference_protected(fn->right,
+ lockdep_is_held(&table->tb6_lock)) :
+ rcu_dereference_protected(fn->left,
+ lockdep_is_held(&table->tb6_lock));
} while (fn);
if (!allow_create) {
@@ -668,19 +680,17 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
* Create new leaf node without children.
*/
- ln = node_alloc();
+ ln = node_alloc(net);
if (!ln)
return ERR_PTR(-ENOMEM);
ln->fn_bit = plen;
-
- ln->parent = pn;
- ln->fn_sernum = sernum;
+ RCU_INIT_POINTER(ln->parent, pn);
if (dir)
- pn->right = ln;
+ rcu_assign_pointer(pn->right, ln);
else
- pn->left = ln;
+ rcu_assign_pointer(pn->left, ln);
return ln;
@@ -694,7 +704,8 @@ insert_above:
* and the current
*/
- pn = fn->parent;
+ pn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&table->tb6_lock));
/* find 1st bit in difference between the 2 addrs.
@@ -710,14 +721,14 @@ insert_above:
* (new leaf node)[ln] (old node)[fn]
*/
if (plen > bit) {
- in = node_alloc();
- ln = node_alloc();
+ in = node_alloc(net);
+ ln = node_alloc(net);
if (!in || !ln) {
if (in)
- node_free_immediate(in);
+ node_free_immediate(net, in);
if (ln)
- node_free_immediate(ln);
+ node_free_immediate(net, ln);
return ERR_PTR(-ENOMEM);
}
@@ -731,31 +742,28 @@ insert_above:
in->fn_bit = bit;
- in->parent = pn;
+ RCU_INIT_POINTER(in->parent, pn);
in->leaf = fn->leaf;
- atomic_inc(&in->leaf->rt6i_ref);
-
- in->fn_sernum = sernum;
+ atomic_inc(&rcu_dereference_protected(in->leaf,
+ lockdep_is_held(&table->tb6_lock))->rt6i_ref);
/* update parent pointer */
if (dir)
- pn->right = in;
+ rcu_assign_pointer(pn->right, in);
else
- pn->left = in;
+ rcu_assign_pointer(pn->left, in);
ln->fn_bit = plen;
- ln->parent = in;
- fn->parent = in;
-
- ln->fn_sernum = sernum;
+ RCU_INIT_POINTER(ln->parent, in);
+ rcu_assign_pointer(fn->parent, in);
if (addr_bit_set(addr, bit)) {
- in->right = ln;
- in->left = fn;
+ rcu_assign_pointer(in->right, ln);
+ rcu_assign_pointer(in->left, fn);
} else {
- in->left = ln;
- in->right = fn;
+ rcu_assign_pointer(in->left, ln);
+ rcu_assign_pointer(in->right, fn);
}
} else { /* plen <= bit */
@@ -765,28 +773,26 @@ insert_above:
* (old node)[fn] NULL
*/
- ln = node_alloc();
+ ln = node_alloc(net);
if (!ln)
return ERR_PTR(-ENOMEM);
ln->fn_bit = plen;
- ln->parent = pn;
-
- ln->fn_sernum = sernum;
-
- if (dir)
- pn->right = ln;
- else
- pn->left = ln;
+ RCU_INIT_POINTER(ln->parent, pn);
if (addr_bit_set(&key->addr, plen))
- ln->right = fn;
+ RCU_INIT_POINTER(ln->right, fn);
else
- ln->left = fn;
+ RCU_INIT_POINTER(ln->left, fn);
+
+ rcu_assign_pointer(fn->parent, ln);
- fn->parent = ln;
+ if (dir)
+ rcu_assign_pointer(pn->right, ln);
+ else
+ rcu_assign_pointer(pn->left, ln);
}
return ln;
}
@@ -832,6 +838,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
struct net *net)
{
+ struct fib6_table *table = rt->rt6i_table;
+
if (atomic_read(&rt->rt6i_ref) != 1) {
/* This route is used as dummy address holder in some split
* nodes. It is not leaked, but it still holds other resources,
@@ -840,12 +848,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
* to still alive ones.
*/
while (fn) {
- if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
- fn->leaf = fib6_find_prefix(net, fn);
- atomic_inc(&fn->leaf->rt6i_ref);
+ struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ struct rt6_info *new_leaf;
+ if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
+ new_leaf = fib6_find_prefix(net, table, fn);
+ atomic_inc(&new_leaf->rt6i_ref);
+ rcu_assign_pointer(fn->leaf, new_leaf);
rt6_release(rt);
}
- fn = fn->parent;
+ fn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&table->tb6_lock));
}
}
}
@@ -857,9 +870,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
struct nl_info *info, struct mx6_config *mxc)
{
+ struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
struct rt6_info *iter = NULL;
- struct rt6_info **ins;
- struct rt6_info **fallback_ins = NULL;
+ struct rt6_info __rcu **ins;
+ struct rt6_info __rcu **fallback_ins = NULL;
int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
int add = (!info->nlh ||
@@ -874,7 +889,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
ins = &fn->leaf;
- for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
+ for (iter = leaf; iter;
+ iter = rcu_dereference_protected(iter->dst.rt6_next,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
/*
* Search for duplicates
*/
@@ -936,7 +953,8 @@ next_iter:
if (fallback_ins && !found) {
/* No ECMP-able route found, replace first non-ECMP one */
ins = fallback_ins;
- iter = *ins;
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
found++;
}
@@ -950,7 +968,7 @@ next_iter:
struct rt6_info *sibling, *temp_sibling;
/* Find the first route that have the same metric */
- sibling = fn->leaf;
+ sibling = leaf;
while (sibling) {
if (sibling->rt6i_metric == rt->rt6i_metric &&
rt6_qualify_for_ecmp(sibling)) {
@@ -958,7 +976,8 @@ next_iter:
&sibling->rt6i_siblings);
break;
}
- sibling = sibling->dst.rt6_next;
+ sibling = rcu_dereference_protected(sibling->dst.rt6_next,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
}
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
@@ -987,10 +1006,10 @@ add:
if (err)
return err;
- rt->dst.rt6_next = iter;
- *ins = rt;
- rcu_assign_pointer(rt->rt6i_node, fn);
+ rcu_assign_pointer(rt->dst.rt6_next, iter);
atomic_inc(&rt->rt6i_ref);
+ rcu_assign_pointer(rt->rt6i_node, fn);
+ rcu_assign_pointer(*ins, rt);
call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
rt);
if (!info->skip_notify)
@@ -1016,10 +1035,10 @@ add:
if (err)
return err;
- *ins = rt;
+ atomic_inc(&rt->rt6i_ref);
rcu_assign_pointer(rt->rt6i_node, fn);
rt->dst.rt6_next = iter->dst.rt6_next;
- atomic_inc(&rt->rt6i_ref);
+ rcu_assign_pointer(*ins, rt);
call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
rt);
if (!info->skip_notify)
@@ -1031,14 +1050,15 @@ add:
nsiblings = iter->rt6i_nsiblings;
iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
- if (fn->rr_ptr == iter)
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
rt6_release(iter);
if (nsiblings) {
/* Replacing an ECMP route, remove all siblings */
ins = &rt->dst.rt6_next;
- iter = *ins;
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
while (iter) {
if (iter->rt6i_metric > rt->rt6i_metric)
break;
@@ -1046,14 +1066,16 @@ add:
*ins = iter->dst.rt6_next;
iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
- if (fn->rr_ptr == iter)
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
rt6_release(iter);
nsiblings--;
+ info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
ins = &iter->dst.rt6_next;
}
- iter = *ins;
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
}
WARN_ON(nsiblings != 0);
}
@@ -1077,16 +1099,33 @@ void fib6_force_start_gc(struct net *net)
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
+static void fib6_update_sernum_upto_root(struct rt6_info *rt,
+ int sernum)
+{
+ struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
+
+ /* paired with smp_rmb() in rt6_get_cookie_safe() */
+ smp_wmb();
+ while (fn) {
+ fn->fn_sernum = sernum;
+ fn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ }
+}
+
/*
* Add routing information to the routing tree.
* <destination addr>/<source addr>
* with source addr info in sub-trees
+ * Need to own table->tb6_lock
*/
int fib6_add(struct fib6_node *root, struct rt6_info *rt,
struct nl_info *info, struct mx6_config *mxc,
struct netlink_ext_ack *extack)
{
+ struct fib6_table *table = rt->rt6i_table;
struct fib6_node *fn, *pn = NULL;
int err = -ENOMEM;
int allow_create = 1;
@@ -1095,6 +1134,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
return -EINVAL;
+ if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
+ return -EINVAL;
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1105,9 +1146,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (!allow_create && !replace_required)
pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
- fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
+ fn = fib6_add_1(info->nl_net, table, root,
+ &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
offsetof(struct rt6_info, rt6i_dst), allow_create,
- replace_required, sernum, extack);
+ replace_required, extack);
if (IS_ERR(fn)) {
err = PTR_ERR(fn);
fn = NULL;
@@ -1120,7 +1162,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (rt->rt6i_src.plen) {
struct fib6_node *sn;
- if (!fn->subtree) {
+ if (!rcu_access_pointer(fn->subtree)) {
struct fib6_node *sfn;
/*
@@ -1134,42 +1176,40 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
*/
/* Create subtree root node */
- sfn = node_alloc();
+ sfn = node_alloc(info->nl_net);
if (!sfn)
goto failure;
- sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+ rcu_assign_pointer(sfn->leaf,
+ info->nl_net->ipv6.ip6_null_entry);
sfn->fn_flags = RTN_ROOT;
- sfn->fn_sernum = sernum;
/* Now add the first leaf node to new subtree */
- sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
- rt->rt6i_src.plen,
+ sn = fib6_add_1(info->nl_net, table, sfn,
+ &rt->rt6i_src.addr, rt->rt6i_src.plen,
offsetof(struct rt6_info, rt6i_src),
- allow_create, replace_required, sernum,
- extack);
+ allow_create, replace_required, extack);
if (IS_ERR(sn)) {
/* If it is failed, discard just allocated
root, and then (in failure) stale node
in main tree.
*/
- node_free_immediate(sfn);
+ node_free_immediate(info->nl_net, sfn);
err = PTR_ERR(sn);
goto failure;
}
/* Now link new subtree to main tree */
- sfn->parent = fn;
- fn->subtree = sfn;
+ rcu_assign_pointer(sfn->parent, fn);
+ rcu_assign_pointer(fn->subtree, sfn);
} else {
- sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
- rt->rt6i_src.plen,
+ sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
+ &rt->rt6i_src.addr, rt->rt6i_src.plen,
offsetof(struct rt6_info, rt6i_src),
- allow_create, replace_required, sernum,
- extack);
+ allow_create, replace_required, extack);
if (IS_ERR(sn)) {
err = PTR_ERR(sn);
@@ -1177,9 +1217,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
}
}
- if (!fn->leaf) {
- fn->leaf = rt;
+ if (!rcu_access_pointer(fn->leaf)) {
atomic_inc(&rt->rt6i_ref);
+ rcu_assign_pointer(fn->leaf, rt);
}
fn = sn;
}
@@ -1187,9 +1227,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
err = fib6_add_rt2node(fn, rt, info, mxc);
if (!err) {
+ fib6_update_sernum_upto_root(rt, sernum);
fib6_start_gc(info->nl_net, rt);
- if (!(rt->rt6i_flags & RTF_CACHE))
- fib6_prune_clones(info->nl_net, pn);
}
out:
@@ -1199,19 +1238,23 @@ out:
* If fib6_add_1 has cleared the old leaf pointer in the
* super-tree leaf node we have to find a new one for it.
*/
- if (pn != fn && pn->leaf == rt) {
- pn->leaf = NULL;
+ struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ if (pn != fn && pn_leaf == rt) {
+ pn_leaf = NULL;
+ RCU_INIT_POINTER(pn->leaf, NULL);
atomic_dec(&rt->rt6i_ref);
}
- if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
- pn->leaf = fib6_find_prefix(info->nl_net, pn);
+ if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
+ pn_leaf = fib6_find_prefix(info->nl_net, table, pn);
#if RT6_DEBUG >= 2
- if (!pn->leaf) {
- WARN_ON(pn->leaf == NULL);
- pn->leaf = info->nl_net->ipv6.ip6_null_entry;
+ if (!pn_leaf) {
+ WARN_ON(!pn_leaf);
+ pn_leaf = info->nl_net->ipv6.ip6_null_entry;
}
#endif
- atomic_inc(&pn->leaf->rt6i_ref);
+ atomic_inc(&pn_leaf->rt6i_ref);
+ rcu_assign_pointer(pn->leaf, pn_leaf);
}
#endif
goto failure;
@@ -1226,7 +1269,7 @@ failure:
* fn->leaf.
*/
if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
- fib6_repair_tree(info->nl_net, fn);
+ fib6_repair_tree(info->nl_net, table, fn);
/* Always release dst as dst->__refcnt is guaranteed
* to be taken before entering this function
*/
@@ -1264,7 +1307,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
dir = addr_bit_set(args->addr, fn->fn_bit);
- next = dir ? fn->right : fn->left;
+ next = dir ? rcu_dereference(fn->right) :
+ rcu_dereference(fn->left);
if (next) {
fn = next;
@@ -1274,18 +1318,22 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
}
while (fn) {
- if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
+ struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
+ if (subtree || fn->fn_flags & RTN_RTINFO) {
+ struct rt6_info *leaf = rcu_dereference(fn->leaf);
struct rt6key *key;
- key = (struct rt6key *) ((u8 *) fn->leaf +
- args->offset);
+ if (!leaf)
+ goto backtrack;
+
+ key = (struct rt6key *) ((u8 *)leaf + args->offset);
if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
#ifdef CONFIG_IPV6_SUBTREES
- if (fn->subtree) {
+ if (subtree) {
struct fib6_node *sfn;
- sfn = fib6_lookup_1(fn->subtree,
- args + 1);
+ sfn = fib6_lookup_1(subtree, args + 1);
if (!sfn)
goto backtrack;
fn = sfn;
@@ -1295,18 +1343,18 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
return fn;
}
}
-#ifdef CONFIG_IPV6_SUBTREES
backtrack:
-#endif
if (fn->fn_flags & RTN_ROOT)
break;
- fn = fn->parent;
+ fn = rcu_dereference(fn->parent);
}
return NULL;
}
+/* called with rcu_read_lock() held
+ */
struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
@@ -1337,54 +1385,84 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
/*
* Get node with specified destination prefix (and source prefix,
* if subtrees are used)
+ * exact_match == true means we try to find fn with exact match of
+ * the passed in prefix addr
+ * exact_match == false means we try to find fn with longest prefix
+ * match of the passed in prefix addr. This is useful for finding fn
+ * for cached route as it will be stored in the exception table under
+ * the node with longest prefix length.
*/
static struct fib6_node *fib6_locate_1(struct fib6_node *root,
const struct in6_addr *addr,
- int plen, int offset)
+ int plen, int offset,
+ bool exact_match)
{
- struct fib6_node *fn;
+ struct fib6_node *fn, *prev = NULL;
for (fn = root; fn ; ) {
- struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
+ struct rt6_info *leaf = rcu_dereference(fn->leaf);
+ struct rt6key *key;
+
+ /* This node is being deleted */
+ if (!leaf) {
+ if (plen <= fn->fn_bit)
+ goto out;
+ else
+ goto next;
+ }
+
+ key = (struct rt6key *)((u8 *)leaf + offset);
/*
* Prefix match
*/
if (plen < fn->fn_bit ||
!ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
- return NULL;
+ goto out;
if (plen == fn->fn_bit)
return fn;
+ prev = fn;
+
+next:
/*
* We have more bits to go
*/
if (addr_bit_set(addr, fn->fn_bit))
- fn = fn->right;
+ fn = rcu_dereference(fn->right);
else
- fn = fn->left;
+ fn = rcu_dereference(fn->left);
}
- return NULL;
+out:
+ if (exact_match)
+ return NULL;
+ else
+ return prev;
}
struct fib6_node *fib6_locate(struct fib6_node *root,
const struct in6_addr *daddr, int dst_len,
- const struct in6_addr *saddr, int src_len)
+ const struct in6_addr *saddr, int src_len,
+ bool exact_match)
{
struct fib6_node *fn;
fn = fib6_locate_1(root, daddr, dst_len,
- offsetof(struct rt6_info, rt6i_dst));
+ offsetof(struct rt6_info, rt6i_dst),
+ exact_match);
#ifdef CONFIG_IPV6_SUBTREES
if (src_len) {
+ struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
WARN_ON(saddr == NULL);
- if (fn && fn->subtree)
- fn = fib6_locate_1(fn->subtree, saddr, src_len,
- offsetof(struct rt6_info, rt6i_src));
+ if (fn && subtree)
+ fn = fib6_locate_1(subtree, saddr, src_len,
+ offsetof(struct rt6_info, rt6i_src),
+ exact_match);
}
#endif
@@ -1400,16 +1478,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
*
*/
-static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
+static struct rt6_info *fib6_find_prefix(struct net *net,
+ struct fib6_table *table,
+ struct fib6_node *fn)
{
+ struct fib6_node *child_left, *child_right;
+
if (fn->fn_flags & RTN_ROOT)
return net->ipv6.ip6_null_entry;
while (fn) {
- if (fn->left)
- return fn->left->leaf;
- if (fn->right)
- return fn->right->leaf;
+ child_left = rcu_dereference_protected(fn->left,
+ lockdep_is_held(&table->tb6_lock));
+ child_right = rcu_dereference_protected(fn->right,
+ lockdep_is_held(&table->tb6_lock));
+ if (child_left)
+ return rcu_dereference_protected(child_left->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ if (child_right)
+ return rcu_dereference_protected(child_right->leaf,
+ lockdep_is_held(&table->tb6_lock));
fn = FIB6_SUBTREE(fn);
}
@@ -1419,31 +1507,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
/*
* Called to trim the tree of intermediate nodes when possible. "fn"
* is the node we want to try and remove.
+ * Need to own table->tb6_lock
*/
static struct fib6_node *fib6_repair_tree(struct net *net,
- struct fib6_node *fn)
+ struct fib6_table *table,
+ struct fib6_node *fn)
{
int children;
int nstate;
- struct fib6_node *child, *pn;
+ struct fib6_node *child;
struct fib6_walker *w;
int iter = 0;
for (;;) {
+ struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *pn = rcu_dereference_protected(fn->parent,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
+ lockdep_is_held(&table->tb6_lock));
+ struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
+ lockdep_is_held(&table->tb6_lock));
+ struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ struct rt6_info *new_fn_leaf;
+
RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
iter++;
WARN_ON(fn->fn_flags & RTN_RTINFO);
WARN_ON(fn->fn_flags & RTN_TL_ROOT);
- WARN_ON(fn->leaf);
+ WARN_ON(fn_leaf);
children = 0;
child = NULL;
- if (fn->right)
- child = fn->right, children |= 1;
- if (fn->left)
- child = fn->left, children |= 2;
+ if (fn_r)
+ child = fn_r, children |= 1;
+ if (fn_l)
+ child = fn_l, children |= 2;
if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
@@ -1451,36 +1557,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
|| (children && fn->fn_flags & RTN_ROOT)
#endif
) {
- fn->leaf = fib6_find_prefix(net, fn);
+ new_fn_leaf = fib6_find_prefix(net, table, fn);
#if RT6_DEBUG >= 2
- if (!fn->leaf) {
- WARN_ON(!fn->leaf);
- fn->leaf = net->ipv6.ip6_null_entry;
+ if (!new_fn_leaf) {
+ WARN_ON(!new_fn_leaf);
+ new_fn_leaf = net->ipv6.ip6_null_entry;
}
#endif
- atomic_inc(&fn->leaf->rt6i_ref);
- return fn->parent;
+ atomic_inc(&new_fn_leaf->rt6i_ref);
+ rcu_assign_pointer(fn->leaf, new_fn_leaf);
+ return pn;
}
- pn = fn->parent;
#ifdef CONFIG_IPV6_SUBTREES
if (FIB6_SUBTREE(pn) == fn) {
WARN_ON(!(fn->fn_flags & RTN_ROOT));
- FIB6_SUBTREE(pn) = NULL;
+ RCU_INIT_POINTER(pn->subtree, NULL);
nstate = FWS_L;
} else {
WARN_ON(fn->fn_flags & RTN_ROOT);
#endif
- if (pn->right == fn)
- pn->right = child;
- else if (pn->left == fn)
- pn->left = child;
+ if (pn_r == fn)
+ rcu_assign_pointer(pn->right, child);
+ else if (pn_l == fn)
+ rcu_assign_pointer(pn->left, child);
#if RT6_DEBUG >= 2
else
WARN_ON(1);
#endif
if (child)
- child->parent = pn;
+ rcu_assign_pointer(child->parent, pn);
nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
}
@@ -1489,19 +1595,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
read_lock(&net->ipv6.fib6_walker_lock);
FOR_WALKERS(net, w) {
if (!child) {
- if (w->root == fn) {
- w->root = w->node = NULL;
- RT6_TRACE("W %p adjusted by delroot 1\n", w);
- } else if (w->node == fn) {
+ if (w->node == fn) {
RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
w->node = pn;
w->state = nstate;
}
} else {
- if (w->root == fn) {
- w->root = child;
- RT6_TRACE("W %p adjusted by delroot 2\n", w);
- }
if (w->node == fn) {
w->node = child;
if (children&2) {
@@ -1516,33 +1615,39 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
}
read_unlock(&net->ipv6.fib6_walker_lock);
- node_free(fn);
+ node_free(net, fn);
if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
return pn;
- rt6_release(pn->leaf);
- pn->leaf = NULL;
+ RCU_INIT_POINTER(pn->leaf, NULL);
+ rt6_release(pn_leaf);
fn = pn;
}
}
-static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
- struct nl_info *info)
+static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
+ struct rt6_info __rcu **rtp, struct nl_info *info)
{
struct fib6_walker *w;
- struct rt6_info *rt = *rtp;
+ struct rt6_info *rt = rcu_dereference_protected(*rtp,
+ lockdep_is_held(&table->tb6_lock));
struct net *net = info->nl_net;
RT6_TRACE("fib6_del_route\n");
+ WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
+
/* Unlink it */
*rtp = rt->dst.rt6_next;
rt->rt6i_node = NULL;
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
+ /* Flush all cached dst in exception table */
+ rt6_flush_exceptions(rt);
+
/* Reset round-robin state, if necessary */
- if (fn->rr_ptr == rt)
+ if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
/* Remove this entry from other siblings */
@@ -1561,20 +1666,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
- w->leaf = rt->dst.rt6_next;
+ w->leaf = rcu_dereference_protected(rt->dst.rt6_next,
+ lockdep_is_held(&table->tb6_lock));
if (!w->leaf)
w->state = FWS_U;
}
}
read_unlock(&net->ipv6.fib6_walker_lock);
- rt->dst.rt6_next = NULL;
-
/* If it was last route, expunge its radix tree node */
- if (!fn->leaf) {
+ if (!rcu_access_pointer(fn->leaf)) {
fn->fn_flags &= ~RTN_RTINFO;
net->ipv6.rt6_stats->fib_route_nodes--;
- fn = fib6_repair_tree(net, fn);
+ fn = fib6_repair_tree(net, table, fn);
}
fib6_purge_rt(rt, fn, net);
@@ -1585,12 +1689,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
rt6_release(rt);
}
+/* Need to own table->tb6_lock */
int fib6_del(struct rt6_info *rt, struct nl_info *info)
{
struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ struct fib6_table *table = rt->rt6i_table;
struct net *net = info->nl_net;
- struct rt6_info **rtp;
+ struct rt6_info __rcu **rtp;
+ struct rt6_info __rcu **rtp_next;
#if RT6_DEBUG >= 2
if (rt->dst.obsolete > 0) {
@@ -1603,28 +1710,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
- if (!(rt->rt6i_flags & RTF_CACHE)) {
- struct fib6_node *pn = fn;
-#ifdef CONFIG_IPV6_SUBTREES
- /* clones of this route might be in another subtree */
- if (rt->rt6i_src.plen) {
- while (!(pn->fn_flags & RTN_ROOT))
- pn = pn->parent;
- pn = pn->parent;
- }
-#endif
- fib6_prune_clones(info->nl_net, pn);
- }
+ /* remove cached dst from exception table */
+ if (rt->rt6i_flags & RTF_CACHE)
+ return rt6_remove_exception_rt(rt);
/*
* Walk the leaf entries looking for ourself
*/
- for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
- if (*rtp == rt) {
- fib6_del_route(fn, rtp, info);
+ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
+ struct rt6_info *cur = rcu_dereference_protected(*rtp,
+ lockdep_is_held(&table->tb6_lock));
+ if (rt == cur) {
+ fib6_del_route(table, fn, rtp, info);
return 0;
}
+ rtp_next = &cur->dst.rt6_next;
}
return -ENOENT;
}
@@ -1651,22 +1752,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
* 0 -> walk is complete.
* >0 -> walk is incomplete (i.e. suspended)
* <0 -> walk is terminated by an error.
+ *
+ * This function is called with tb6_lock held.
*/
static int fib6_walk_continue(struct fib6_walker *w)
{
- struct fib6_node *fn, *pn;
+ struct fib6_node *fn, *pn, *left, *right;
+
+ /* w->root should always be table->tb6_root */
+ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
for (;;) {
fn = w->node;
if (!fn)
return 0;
- if (w->prune && fn != w->root &&
- fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
- w->state = FWS_C;
- w->leaf = fn->leaf;
- }
switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
case FWS_S:
@@ -1677,20 +1778,22 @@ static int fib6_walk_continue(struct fib6_walker *w)
w->state = FWS_L;
#endif
case FWS_L:
- if (fn->left) {
- w->node = fn->left;
+ left = rcu_dereference_protected(fn->left, 1);
+ if (left) {
+ w->node = left;
w->state = FWS_INIT;
continue;
}
w->state = FWS_R;
case FWS_R:
- if (fn->right) {
- w->node = fn->right;
+ right = rcu_dereference_protected(fn->right, 1);
+ if (right) {
+ w->node = right;
w->state = FWS_INIT;
continue;
}
w->state = FWS_C;
- w->leaf = fn->leaf;
+ w->leaf = rcu_dereference_protected(fn->leaf, 1);
case FWS_C:
if (w->leaf && fn->fn_flags & RTN_RTINFO) {
int err;
@@ -1712,7 +1815,9 @@ skip:
case FWS_U:
if (fn == w->root)
return 0;
- pn = fn->parent;
+ pn = rcu_dereference_protected(fn->parent, 1);
+ left = rcu_dereference_protected(pn->left, 1);
+ right = rcu_dereference_protected(pn->right, 1);
w->node = pn;
#ifdef CONFIG_IPV6_SUBTREES
if (FIB6_SUBTREE(pn) == fn) {
@@ -1721,13 +1826,13 @@ skip:
continue;
}
#endif
- if (pn->left == fn) {
+ if (left == fn) {
w->state = FWS_R;
continue;
}
- if (pn->right == fn) {
+ if (right == fn) {
w->state = FWS_C;
- w->leaf = w->node->leaf;
+ w->leaf = rcu_dereference_protected(w->node->leaf, 1);
continue;
}
#if RT6_DEBUG >= 2
@@ -1770,7 +1875,7 @@ static int fib6_clean_node(struct fib6_walker *w)
return 0;
}
- for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+ for_each_fib6_walker_rt(w) {
res = c->func(rt, c->arg);
if (res < 0) {
w->leaf = rt;
@@ -1798,20 +1903,16 @@ static int fib6_clean_node(struct fib6_walker *w)
* func is called on each route.
* It may return -1 -> delete this route.
* 0 -> continue walking
- *
- * prune==1 -> only immediate children of node (certainly,
- * ignoring pure split nodes) will be scanned.
*/
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
int (*func)(struct rt6_info *, void *arg),
- bool prune, int sernum, void *arg)
+ int sernum, void *arg)
{
struct fib6_cleaner c;
c.w.root = root;
c.w.func = fib6_clean_node;
- c.w.prune = prune;
c.w.count = 0;
c.w.skip = 0;
c.func = func;
@@ -1834,10 +1935,10 @@ static void __fib6_clean_all(struct net *net,
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
head = &net->ipv6.fib_table_hash[h];
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
- write_lock_bh(&table->tb6_lock);
+ spin_lock_bh(&table->tb6_lock);
fib6_clean_tree(net, &table->tb6_root,
- func, false, sernum, arg);
- write_unlock_bh(&table->tb6_lock);
+ func, sernum, arg);
+ spin_unlock_bh(&table->tb6_lock);
}
}
rcu_read_unlock();
@@ -1849,22 +1950,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
}
-static int fib6_prune_clone(struct rt6_info *rt, void *arg)
-{
- if (rt->rt6i_flags & RTF_CACHE) {
- RT6_TRACE("pruning clone %p\n", rt);
- return -1;
- }
-
- return 0;
-}
-
-static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
-{
- fib6_clean_tree(net, fn, fib6_prune_clone, true,
- FIB6_NO_SERNUM_CHANGE, NULL);
-}
-
static void fib6_flush_trees(struct net *net)
{
int new_sernum = fib6_new_sernum(net);
@@ -1876,12 +1961,6 @@ static void fib6_flush_trees(struct net *net)
* Garbage collection
*/
-struct fib6_gc_args
-{
- int timeout;
- int more;
-};
-
static int fib6_age(struct rt6_info *rt, void *arg)
{
struct fib6_gc_args *gc_args = arg;
@@ -1890,9 +1969,6 @@ static int fib6_age(struct rt6_info *rt, void *arg)
/*
* check addrconf expiration here.
* Routes are expired even if they are in use.
- *
- * Also age clones. Note, that clones are aged out
- * only if they are not in use now.
*/
if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
@@ -1901,31 +1977,14 @@ static int fib6_age(struct rt6_info *rt, void *arg)
return -1;
}
gc_args->more++;
- } else if (rt->rt6i_flags & RTF_CACHE) {
- if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout))
- rt->dst.obsolete = DST_OBSOLETE_KILL;
- if (atomic_read(&rt->dst.__refcnt) == 1 &&
- rt->dst.obsolete == DST_OBSOLETE_KILL) {
- RT6_TRACE("aging clone %p\n", rt);
- return -1;
- } else if (rt->rt6i_flags & RTF_GATEWAY) {
- struct neighbour *neigh;
- __u8 neigh_flags = 0;
-
- neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
- if (neigh) {
- neigh_flags = neigh->flags;
- neigh_release(neigh);
- }
- if (!(neigh_flags & NTF_ROUTER)) {
- RT6_TRACE("purging route %p via non-router but gateway\n",
- rt);
- return -1;
- }
- }
- gc_args->more++;
}
+ /* Also age clones in the exception table.
+ * Note, that clones are aged out
+ * only if they are not in use now.
+ */
+ rt6_age_exceptions(rt, gc_args, now);
+
return 0;
}
@@ -1993,7 +2052,8 @@ static int __net_init fib6_net_init(struct net *net)
goto out_fib_table_hash;
net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
- net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
+ net->ipv6.ip6_null_entry);
net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
@@ -2004,7 +2064,8 @@ static int __net_init fib6_net_init(struct net *net)
if (!net->ipv6.fib6_local_tbl)
goto out_fib6_main_tbl;
net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
- net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
+ net->ipv6.ip6_null_entry);
net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
@@ -2134,7 +2195,9 @@ static int ipv6_route_yield(struct fib6_walker *w)
return 1;
do {
- iter->w.leaf = iter->w.leaf->dst.rt6_next;
+ iter->w.leaf = rcu_dereference_protected(
+ iter->w.leaf->dst.rt6_next,
+ lockdep_is_held(&iter->tbl->tb6_lock));
iter->skip--;
if (!iter->skip && iter->w.leaf)
return 1;
@@ -2199,7 +2262,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (!v)
goto iter_table;
- n = ((struct rt6_info *)v)->dst.rt6_next;
+ n = rcu_dereference(((struct rt6_info *)v)->dst.rt6_next);
if (n) {
++*pos;
return n;
@@ -2207,9 +2270,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
iter_table:
ipv6_route_check_sernum(iter);
- read_lock(&iter->tbl->tb6_lock);
+ spin_lock_bh(&iter->tbl->tb6_lock);
r = fib6_walk_continue(&iter->w);
- read_unlock(&iter->tbl->tb6_lock);
+ spin_unlock_bh(&iter->tbl->tb6_lock);
if (r > 0) {
if (v)
++*pos;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 26cc9f483b6d..399d1bceec4a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -44,6 +44,7 @@
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
+#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
@@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net,
struct in6_addr *dst, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
unsigned int flags);
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+ struct in6_addr *daddr,
+ struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -139,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt)
{
if (!list_empty(&rt->rt6i_uncached)) {
struct uncached_list *ul = rt->rt6i_uncached_list;
+ struct net *net = dev_net(rt->dst.dev);
spin_lock_bh(&ul->lock);
list_del(&rt->rt6i_uncached);
+ atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
spin_unlock_bh(&ul->lock);
}
}
@@ -355,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
1, DST_OBSOLETE_FORCE_CHK, flags);
- if (rt)
+ if (rt) {
rt6_info_init(rt);
+ atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
+ }
return rt;
}
@@ -392,6 +400,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
+ struct rt6_exception_bucket *bucket;
struct dst_entry *from = dst->from;
struct inet6_dev *idev;
@@ -404,6 +413,11 @@ static void ip6_dst_destroy(struct dst_entry *dst)
rt->rt6i_idev = NULL;
in6_dev_put(idev);
}
+ bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
+ if (bucket) {
+ rt->rt6i_exception_bucket = NULL;
+ kfree(bucket);
+ }
dst->from = NULL;
dst_release(from);
@@ -478,7 +492,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
}
/*
- * Route lookup. Any table->tb6_lock is implied.
+ * Route lookup. rcu_read_lock() should be held.
*/
static inline struct rt6_info *rt6_device_match(struct net *net,
@@ -493,7 +507,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
if (!oif && ipv6_addr_any(saddr))
goto out;
- for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
+ for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
struct net_device *dev = sprt->dst.dev;
if (oif) {
@@ -702,6 +716,7 @@ out:
}
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+ struct rt6_info *leaf,
struct rt6_info *rr_head,
u32 metric, int oif, int strict,
bool *do_rr)
@@ -711,7 +726,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
match = NULL;
cont = NULL;
- for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+ for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
if (rt->rt6i_metric != metric) {
cont = rt;
break;
@@ -720,7 +735,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
match = find_match(rt, oif, strict, &mpri, match, do_rr);
}
- for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+ for (rt = leaf; rt && rt != rr_head;
+ rt = rcu_dereference(rt->dst.rt6_next)) {
if (rt->rt6i_metric != metric) {
cont = rt;
break;
@@ -732,37 +748,59 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
if (match || !cont)
return match;
- for (rt = cont; rt; rt = rt->dst.rt6_next)
+ for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
match = find_match(rt, oif, strict, &mpri, match, do_rr);
return match;
}
-static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
+ int oif, int strict)
{
+ struct rt6_info *leaf = rcu_dereference(fn->leaf);
struct rt6_info *match, *rt0;
- struct net *net;
bool do_rr = false;
+ int key_plen;
+
+ if (!leaf)
+ return net->ipv6.ip6_null_entry;
- rt0 = fn->rr_ptr;
+ rt0 = rcu_dereference(fn->rr_ptr);
if (!rt0)
- fn->rr_ptr = rt0 = fn->leaf;
+ rt0 = leaf;
- match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
+ /* Double check to make sure fn is not an intermediate node
+ * and fn->leaf does not points to its child's leaf
+ * (This might happen if all routes under fn are deleted from
+ * the tree and fib6_repair_tree() is called on the node.)
+ */
+ key_plen = rt0->rt6i_dst.plen;
+#ifdef CONFIG_IPV6_SUBTREES
+ if (rt0->rt6i_src.plen)
+ key_plen = rt0->rt6i_src.plen;
+#endif
+ if (fn->fn_bit != key_plen)
+ return net->ipv6.ip6_null_entry;
+
+ match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
&do_rr);
if (do_rr) {
- struct rt6_info *next = rt0->dst.rt6_next;
+ struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
/* no entries matched; do round-robin */
if (!next || next->rt6i_metric != rt0->rt6i_metric)
- next = fn->leaf;
-
- if (next != rt0)
- fn->rr_ptr = next;
+ next = leaf;
+
+ if (next != rt0) {
+ spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+ /* make sure next is not being deleted from the tree */
+ if (next->rt6i_node)
+ rcu_assign_pointer(fn->rr_ptr, next);
+ spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+ }
}
- net = dev_net(rt0->dst.dev);
return match ? match : net->ipv6.ip6_null_entry;
}
@@ -850,13 +888,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
struct in6_addr *saddr)
{
- struct fib6_node *pn;
+ struct fib6_node *pn, *sn;
while (1) {
if (fn->fn_flags & RTN_TL_ROOT)
return NULL;
- pn = fn->parent;
- if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
- fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+ pn = rcu_dereference(fn->parent);
+ sn = FIB6_SUBTREE(pn);
+ if (sn && sn != fn)
+ fn = fib6_lookup(sn, NULL, saddr);
else
fn = pn;
if (fn->fn_flags & RTN_RTINFO)
@@ -864,27 +903,57 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
}
}
+static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
+ bool null_fallback)
+{
+ struct rt6_info *rt = *prt;
+
+ if (dst_hold_safe(&rt->dst))
+ return true;
+ if (null_fallback) {
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ } else {
+ rt = NULL;
+ }
+ *prt = rt;
+ return false;
+}
+
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6, int flags)
{
+ struct rt6_info *rt, *rt_cache;
struct fib6_node *fn;
- struct rt6_info *rt;
- read_lock_bh(&table->tb6_lock);
+ rcu_read_lock();
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
- rt = fn->leaf;
- rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
- if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
- rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
+ rt = rcu_dereference(fn->leaf);
+ if (!rt) {
+ rt = net->ipv6.ip6_null_entry;
+ } else {
+ rt = rt6_device_match(net, rt, &fl6->saddr,
+ fl6->flowi6_oif, flags);
+ if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+ rt = rt6_multipath_select(rt, fl6,
+ fl6->flowi6_oif, flags);
+ }
if (rt == net->ipv6.ip6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto restart;
}
- dst_use(&rt->dst, jiffies);
- read_unlock_bh(&table->tb6_lock);
+ /* Search through exception table */
+ rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+ if (rt_cache)
+ rt = rt_cache;
+
+ if (ip6_hold_safe(net, &rt, true))
+ dst_use_noref(&rt->dst, jiffies);
+
+ rcu_read_unlock();
trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
@@ -938,9 +1007,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
struct fib6_table *table;
table = rt->rt6i_table;
- write_lock_bh(&table->tb6_lock);
+ spin_lock_bh(&table->tb6_lock);
err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
- write_unlock_bh(&table->tb6_lock);
+ spin_unlock_bh(&table->tb6_lock);
return err;
}
@@ -1038,7 +1107,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
return pcpu_rt;
}
-/* It should be called with read_lock_bh(&tb6_lock) acquired */
+/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
struct rt6_info *pcpu_rt, **p;
@@ -1046,16 +1115,14 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
p = this_cpu_ptr(rt->rt6i_pcpu);
pcpu_rt = *p;
- if (pcpu_rt) {
- dst_hold(&pcpu_rt->dst);
+ if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
rt6_dst_from_metrics_check(pcpu_rt);
- }
+
return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
- struct fib6_table *table = rt->rt6i_table;
struct rt6_info *pcpu_rt, *prev, **p;
pcpu_rt = ip6_rt_pcpu_alloc(rt);
@@ -1066,36 +1133,520 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
return net->ipv6.ip6_null_entry;
}
- read_lock_bh(&table->tb6_lock);
- if (rt->rt6i_pcpu) {
- p = this_cpu_ptr(rt->rt6i_pcpu);
- prev = cmpxchg(p, NULL, pcpu_rt);
- if (prev) {
- /* If someone did it before us, return prev instead */
- dst_release_immediate(&pcpu_rt->dst);
- pcpu_rt = prev;
- }
- } else {
- /* rt has been removed from the fib6 tree
- * before we have a chance to acquire the read_lock.
- * In this case, don't brother to create a pcpu rt
- * since rt is going away anyway. The next
- * dst_check() will trigger a re-lookup.
- */
+ dst_hold(&pcpu_rt->dst);
+ p = this_cpu_ptr(rt->rt6i_pcpu);
+ prev = cmpxchg(p, NULL, pcpu_rt);
+ if (prev) {
+ /* If someone did it before us, return prev instead */
+ /* release refcnt taken by ip6_rt_pcpu_alloc() */
+ dst_release_immediate(&pcpu_rt->dst);
+ /* release refcnt taken by above dst_hold() */
dst_release_immediate(&pcpu_rt->dst);
- pcpu_rt = rt;
+ dst_hold(&prev->dst);
+ pcpu_rt = prev;
}
- dst_hold(&pcpu_rt->dst);
+
rt6_dst_from_metrics_check(pcpu_rt);
- read_unlock_bh(&table->tb6_lock);
return pcpu_rt;
}
+/* exception hash table implementation
+ */
+static DEFINE_SPINLOCK(rt6_exception_lock);
+
+/* Remove rt6_ex from hash table and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
+ struct rt6_exception *rt6_ex)
+{
+ struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
+
+ if (!bucket || !rt6_ex)
+ return;
+ rt6_ex->rt6i->rt6i_node = NULL;
+ hlist_del_rcu(&rt6_ex->hlist);
+ rt6_release(rt6_ex->rt6i);
+ kfree_rcu(rt6_ex, rcu);
+ WARN_ON_ONCE(!bucket->depth);
+ bucket->depth--;
+ net->ipv6.rt6_stats->fib_rt_cache--;
+}
+
+/* Remove oldest rt6_ex in bucket and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
+{
+ struct rt6_exception *rt6_ex, *oldest = NULL;
+
+ if (!bucket)
+ return;
+
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
+ oldest = rt6_ex;
+ }
+ rt6_remove_exception(bucket, oldest);
+}
+
+static u32 rt6_exception_hash(const struct in6_addr *dst,
+ const struct in6_addr *src)
+{
+ static u32 seed __read_mostly;
+ u32 val;
+
+ net_get_random_once(&seed, sizeof(seed));
+ val = jhash(dst, sizeof(*dst), seed);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (src)
+ val = jhash(src, sizeof(*src), val);
+#endif
+ return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rt6_exception_lock
+ */
+static struct rt6_exception *
+__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct rt6_exception *rt6_ex;
+ u32 hval;
+
+ if (!(*bucket) || !daddr)
+ return NULL;
+
+ hval = rt6_exception_hash(daddr, saddr);
+ *bucket += hval;
+
+ hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
+ struct rt6_info *rt6 = rt6_ex->rt6i;
+ bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (matched && saddr)
+ matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+ if (matched)
+ return rt6_ex;
+ }
+ return NULL;
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rcu_read_lock()
+ */
+static struct rt6_exception *
+__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct rt6_exception *rt6_ex;
+ u32 hval;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!(*bucket) || !daddr)
+ return NULL;
+
+ hval = rt6_exception_hash(daddr, saddr);
+ *bucket += hval;
+
+ hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
+ struct rt6_info *rt6 = rt6_ex->rt6i;
+ bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (matched && saddr)
+ matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+ if (matched)
+ return rt6_ex;
+ }
+ return NULL;
+}
+
+static int rt6_insert_exception(struct rt6_info *nrt,
+ struct rt6_info *ort)
+{
+ struct net *net = dev_net(ort->dst.dev);
+ struct rt6_exception_bucket *bucket;
+ struct in6_addr *src_key = NULL;
+ struct rt6_exception *rt6_ex;
+ int err = 0;
+
+ /* ort can't be a cache or pcpu route */
+ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+ ort = (struct rt6_info *)ort->dst.from;
+ WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
+
+ spin_lock_bh(&rt6_exception_lock);
+
+ if (ort->exception_bucket_flushed) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+ if (!bucket) {
+ bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
+ GFP_ATOMIC);
+ if (!bucket) {
+ err = -ENOMEM;
+ goto out;
+ }
+ rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
+ }
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* rt6i_src.plen != 0 indicates ort is in subtree
+ * and exception table is indexed by a hash of
+ * both rt6i_dst and rt6i_src.
+ * Otherwise, the exception table is indexed by
+ * a hash of only rt6i_dst.
+ */
+ if (ort->rt6i_src.plen)
+ src_key = &nrt->rt6i_src.addr;
+#endif
+
+ /* Update rt6i_prefsrc as it could be changed
+ * in rt6_remove_prefsrc()
+ */
+ nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+ /* rt6_mtu_change() might lower mtu on ort.
+ * Only insert this exception route if its mtu
+ * is less than ort's mtu value.
+ */
+ if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
+ src_key);
+ if (rt6_ex)
+ rt6_remove_exception(bucket, rt6_ex);
+
+ rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
+ if (!rt6_ex) {
+ err = -ENOMEM;
+ goto out;
+ }
+ rt6_ex->rt6i = nrt;
+ rt6_ex->stamp = jiffies;
+ atomic_inc(&nrt->rt6i_ref);
+ nrt->rt6i_node = ort->rt6i_node;
+ hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
+ bucket->depth++;
+ net->ipv6.rt6_stats->fib_rt_cache++;
+
+ if (bucket->depth > FIB6_MAX_DEPTH)
+ rt6_exception_remove_oldest(bucket);
+
+out:
+ spin_unlock_bh(&rt6_exception_lock);
+
+ /* Update fn->fn_sernum to invalidate all cached dst */
+ if (!err)
+ fib6_update_sernum(ort);
+
+ return err;
+}
+
+void rt6_flush_exceptions(struct rt6_info *rt)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct hlist_node *tmp;
+ int i;
+
+ spin_lock_bh(&rt6_exception_lock);
+ /* Prevent rt6_insert_exception() to recreate the bucket list */
+ rt->exception_bucket_flushed = 1;
+
+ bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+ if (!bucket)
+ goto out;
+
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
+ rt6_remove_exception(bucket, rt6_ex);
+ WARN_ON_ONCE(bucket->depth);
+ bucket++;
+ }
+
+out:
+ spin_unlock_bh(&rt6_exception_lock);
+}
+
+/* Find cached rt in the hash table inside passed in rt
+ * Caller has to hold rcu_read_lock()
+ */
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+ struct in6_addr *daddr,
+ struct in6_addr *saddr)
+{
+ struct rt6_exception_bucket *bucket;
+ struct in6_addr *src_key = NULL;
+ struct rt6_exception *rt6_ex;
+ struct rt6_info *res = NULL;
+
+ bucket = rcu_dereference(rt->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* rt6i_src.plen != 0 indicates rt is in subtree
+ * and exception table is indexed by a hash of
+ * both rt6i_dst and rt6i_src.
+ * Otherwise, the exception table is indexed by
+ * a hash of only rt6i_dst.
+ */
+ if (rt->rt6i_src.plen)
+ src_key = saddr;
+#endif
+ rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
+
+ if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
+ res = rt6_ex->rt6i;
+
+ return res;
+}
+
+/* Remove the passed in cached rt from the hash table that contains it */
+int rt6_remove_exception_rt(struct rt6_info *rt)
+{
+ struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+ struct rt6_exception_bucket *bucket;
+ struct in6_addr *src_key = NULL;
+ struct rt6_exception *rt6_ex;
+ int err;
+
+ if (!from ||
+ !(rt->rt6i_flags | RTF_CACHE))
+ return -EINVAL;
+
+ if (!rcu_access_pointer(from->rt6i_exception_bucket))
+ return -ENOENT;
+
+ spin_lock_bh(&rt6_exception_lock);
+ bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+#ifdef CONFIG_IPV6_SUBTREES
+ /* rt6i_src.plen != 0 indicates 'from' is in subtree
+ * and exception table is indexed by a hash of
+ * both rt6i_dst and rt6i_src.
+ * Otherwise, the exception table is indexed by
+ * a hash of only rt6i_dst.
+ */
+ if (from->rt6i_src.plen)
+ src_key = &rt->rt6i_src.addr;
+#endif
+ rt6_ex = __rt6_find_exception_spinlock(&bucket,
+ &rt->rt6i_dst.addr,
+ src_key);
+ if (rt6_ex) {
+ rt6_remove_exception(bucket, rt6_ex);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
+
+ spin_unlock_bh(&rt6_exception_lock);
+ return err;
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+ struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+ struct rt6_exception_bucket *bucket;
+ struct in6_addr *src_key = NULL;
+ struct rt6_exception *rt6_ex;
+
+ if (!from ||
+ !(rt->rt6i_flags | RTF_CACHE))
+ return;
+
+ rcu_read_lock();
+ bucket = rcu_dereference(from->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* rt6i_src.plen != 0 indicates 'from' is in subtree
+ * and exception table is indexed by a hash of
+ * both rt6i_dst and rt6i_src.
+ * Otherwise, the exception table is indexed by
+ * a hash of only rt6i_dst.
+ */
+ if (from->rt6i_src.plen)
+ src_key = &rt->rt6i_src.addr;
+#endif
+ rt6_ex = __rt6_find_exception_rcu(&bucket,
+ &rt->rt6i_dst.addr,
+ src_key);
+ if (rt6_ex)
+ rt6_ex->stamp = jiffies;
+
+ rcu_read_unlock();
+}
+
+static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i;
+
+ bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+
+ if (bucket) {
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
+ }
+ bucket++;
+ }
+ }
+}
+
+static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i;
+
+ bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+
+ if (bucket) {
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ struct rt6_info *entry = rt6_ex->rt6i;
+ /* For RTF_CACHE with rt6i_pmtu == 0
+ * (i.e. a redirected route),
+ * the metrics of its rt->dst.from has already
+ * been updated.
+ */
+ if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
+ entry->rt6i_pmtu = mtu;
+ }
+ bucket++;
+ }
+ }
+}
+
+#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
+
+static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
+ struct in6_addr *gateway)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct hlist_node *tmp;
+ int i;
+
+ if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ return;
+
+ spin_lock_bh(&rt6_exception_lock);
+ bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+
+ if (bucket) {
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry_safe(rt6_ex, tmp,
+ &bucket->chain, hlist) {
+ struct rt6_info *entry = rt6_ex->rt6i;
+
+ if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
+ RTF_CACHE_GATEWAY &&
+ ipv6_addr_equal(gateway,
+ &entry->rt6i_gateway)) {
+ rt6_remove_exception(bucket, rt6_ex);
+ }
+ }
+ bucket++;
+ }
+ }
+
+ spin_unlock_bh(&rt6_exception_lock);
+}
+
+static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
+ struct rt6_exception *rt6_ex,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ struct rt6_info *rt = rt6_ex->rt6i;
+
+ if (atomic_read(&rt->dst.__refcnt) == 1 &&
+ time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+ RT6_TRACE("aging clone %p\n", rt);
+ rt6_remove_exception(bucket, rt6_ex);
+ return;
+ } else if (rt->rt6i_flags & RTF_GATEWAY) {
+ struct neighbour *neigh;
+ __u8 neigh_flags = 0;
+
+ neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
+ if (neigh) {
+ neigh_flags = neigh->flags;
+ neigh_release(neigh);
+ }
+ if (!(neigh_flags & NTF_ROUTER)) {
+ RT6_TRACE("purging route %p via non-router but gateway\n",
+ rt);
+ rt6_remove_exception(bucket, rt6_ex);
+ return;
+ }
+ }
+ gc_args->more++;
+}
+
+void rt6_age_exceptions(struct rt6_info *rt,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct hlist_node *tmp;
+ int i;
+
+ if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ return;
+
+ spin_lock_bh(&rt6_exception_lock);
+ bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
+
+ if (bucket) {
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry_safe(rt6_ex, tmp,
+ &bucket->chain, hlist) {
+ rt6_age_examine_exception(bucket, rt6_ex,
+ gc_args, now);
+ }
+ bucket++;
+ }
+ }
+ spin_unlock_bh(&rt6_exception_lock);
+}
+
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
- struct rt6_info *rt;
+ struct rt6_info *rt, *rt_cache;
int strict = 0;
strict |= flags & RT6_LOOKUP_F_IFACE;
@@ -1103,7 +1654,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
if (net->ipv6.devconf_all->forwarding == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
- read_lock_bh(&table->tb6_lock);
+ rcu_read_lock();
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
@@ -1112,7 +1663,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
oif = 0;
redo_rt6_select:
- rt = rt6_select(fn, oif, strict);
+ rt = rt6_select(net, fn, oif, strict);
if (rt->rt6i_nsiblings)
rt = rt6_multipath_select(rt, fl6, oif, strict);
if (rt == net->ipv6.ip6_null_entry) {
@@ -1127,13 +1678,22 @@ redo_rt6_select:
}
}
+ /*Search through exception table */
+ rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+ if (rt_cache)
+ rt = rt_cache;
- if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
- dst_use(&rt->dst, jiffies);
- read_unlock_bh(&table->tb6_lock);
-
- rt6_dst_from_metrics_check(rt);
-
+ if (rt == net->ipv6.ip6_null_entry) {
+ rcu_read_unlock();
+ dst_hold(&rt->dst);
+ trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
+ return rt;
+ } else if (rt->rt6i_flags & RTF_CACHE) {
+ if (ip6_hold_safe(net, &rt, true)) {
+ dst_use_noref(&rt->dst, jiffies);
+ rt6_dst_from_metrics_check(rt);
+ }
+ rcu_read_unlock();
trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
return rt;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
@@ -1146,8 +1706,14 @@ redo_rt6_select:
struct rt6_info *uncached_rt;
- dst_use(&rt->dst, jiffies);
- read_unlock_bh(&table->tb6_lock);
+ if (ip6_hold_safe(net, &rt, true)) {
+ dst_use_noref(&rt->dst, jiffies);
+ } else {
+ rcu_read_unlock();
+ uncached_rt = rt;
+ goto uncached_rt_out;
+ }
+ rcu_read_unlock();
uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
dst_release(&rt->dst);
@@ -1157,11 +1723,13 @@ redo_rt6_select:
* No need for another dst_hold()
*/
rt6_uncached_list_add(uncached_rt);
+ atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
} else {
uncached_rt = net->ipv6.ip6_null_entry;
dst_hold(&uncached_rt->dst);
}
+uncached_rt_out:
trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
return uncached_rt;
@@ -1170,26 +1738,34 @@ redo_rt6_select:
struct rt6_info *pcpu_rt;
- rt->dst.lastuse = jiffies;
- rt->dst.__use++;
+ dst_use_noref(&rt->dst, jiffies);
pcpu_rt = rt6_get_pcpu_route(rt);
if (pcpu_rt) {
- read_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
} else {
- /* We have to do the read_unlock first
- * because rt6_make_pcpu_route() may trigger
- * ip6_dst_gc() which will take the write_lock.
- */
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- pcpu_rt = rt6_make_pcpu_route(rt);
- dst_release(&rt->dst);
+ /* atomic_inc_not_zero() is needed when using rcu */
+ if (atomic_inc_not_zero(&rt->rt6i_ref)) {
+ /* We have to do the read_unlock first
+ * because rt6_make_pcpu_route() may trigger
+ * ip6_dst_gc() which will take the write_lock.
+ *
+ * No dst_hold() on rt is needed because grabbing
+ * rt->rt6i_ref makes sure rt can't be released.
+ */
+ rcu_read_unlock();
+ pcpu_rt = rt6_make_pcpu_route(rt);
+ rt6_release(rt);
+ } else {
+ /* rt is already removed from tree */
+ rcu_read_unlock();
+ pcpu_rt = net->ipv6.ip6_null_entry;
+ dst_hold(&pcpu_rt->dst);
+ }
}
trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
return pcpu_rt;
-
}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
@@ -1328,6 +1904,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
DST_OBSOLETE_NONE, 0);
if (rt) {
rt6_info_init(rt);
+ atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
new = &rt->dst;
new->__use = 1;
@@ -1491,23 +2068,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (!rt6_cache_allowed_for_pmtu(rt6)) {
rt6_do_update_pmtu(rt6, mtu);
+ /* update rt6_ex->stamp for cache */
+ if (rt6->rt6i_flags & RTF_CACHE)
+ rt6_update_exception_stamp_rt(rt6);
} else if (daddr) {
struct rt6_info *nrt6;
nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
-
- /* ip6_ins_rt(nrt6) will bump the
- * rt6->rt6i_node->fn_sernum
- * which will fail the next rt6_check() and
- * invalidate the sk->sk_dst_cache.
- */
- ip6_ins_rt(nrt6);
- /* Release the reference taken in
- * ip6_rt_cache_alloc()
- */
- dst_release(&nrt6->dst);
+ if (rt6_insert_exception(nrt6, rt6))
+ dst_release_immediate(&nrt6->dst);
}
}
}
@@ -1571,7 +2142,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
int flags)
{
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
- struct rt6_info *rt;
+ struct rt6_info *rt, *rt_cache;
struct fib6_node *fn;
/* Get the "current" route for this destination and
@@ -1584,10 +2155,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
* routes.
*/
- read_lock_bh(&table->tb6_lock);
+ rcu_read_lock();
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
- for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ for_each_fib6_node_rt_rcu(fn) {
if (rt6_check_expired(rt))
continue;
if (rt->dst.error)
@@ -1596,8 +2167,23 @@ restart:
continue;
if (fl6->flowi6_oif != rt->dst.dev->ifindex)
continue;
- if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
+ /* rt_cache's gateway might be different from its 'parent'
+ * in the case of an ip redirect.
+ * So we keep searching in the exception table if the gateway
+ * is different.
+ */
+ if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
+ rt_cache = rt6_find_cached_rt(rt,
+ &fl6->daddr,
+ &fl6->saddr);
+ if (rt_cache &&
+ ipv6_addr_equal(&rdfl->gateway,
+ &rt_cache->rt6i_gateway)) {
+ rt = rt_cache;
+ break;
+ }
continue;
+ }
break;
}
@@ -1615,9 +2201,9 @@ restart:
}
out:
- dst_hold(&rt->dst);
+ ip6_hold_safe(net, &rt, true);
- read_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
return rt;
@@ -1766,6 +2352,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
* do proper release of the net_device
*/
rt6_uncached_list_add(rt);
+ atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
@@ -2216,9 +2803,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
}
table = rt->rt6i_table;
- write_lock_bh(&table->tb6_lock);
+ spin_lock_bh(&table->tb6_lock);
err = fib6_del(rt, info);
- write_unlock_bh(&table->tb6_lock);
+ spin_unlock_bh(&table->tb6_lock);
out:
ip6_rt_put(rt);
@@ -2244,7 +2831,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
if (rt == net->ipv6.ip6_null_entry)
goto out_put;
table = rt->rt6i_table;
- write_lock_bh(&table->tb6_lock);
+ spin_lock_bh(&table->tb6_lock);
if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
struct rt6_info *sibling, *next_sibling;
@@ -2274,7 +2861,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
err = fib6_del(rt, info);
out_unlock:
- write_unlock_bh(&table->tb6_lock);
+ spin_unlock_bh(&table->tb6_lock);
out_put:
ip6_rt_put(rt);
@@ -2288,9 +2875,9 @@ out_put:
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
+ struct rt6_info *rt, *rt_cache;
struct fib6_table *table;
struct fib6_node *fn;
- struct rt6_info *rt;
int err = -ESRCH;
table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
@@ -2299,17 +2886,22 @@ static int ip6_route_del(struct fib6_config *cfg,
return err;
}
- read_lock_bh(&table->tb6_lock);
+ rcu_read_lock();
fn = fib6_locate(&table->tb6_root,
&cfg->fc_dst, cfg->fc_dst_len,
- &cfg->fc_src, cfg->fc_src_len);
+ &cfg->fc_src, cfg->fc_src_len,
+ !(cfg->fc_flags & RTF_CACHE));
if (fn) {
- for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
- if ((rt->rt6i_flags & RTF_CACHE) &&
- !(cfg->fc_flags & RTF_CACHE))
- continue;
+ for_each_fib6_node_rt_rcu(fn) {
+ if (cfg->fc_flags & RTF_CACHE) {
+ rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
+ &cfg->fc_src);
+ if (!rt_cache)
+ continue;
+ rt = rt_cache;
+ }
if (cfg->fc_ifindex &&
(!rt->dst.dev ||
rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -2321,8 +2913,9 @@ static int ip6_route_del(struct fib6_config *cfg,
continue;
if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
continue;
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
+ if (!dst_hold_safe(&rt->dst))
+ break;
+ rcu_read_unlock();
/* if gateway was specified only delete the one hop */
if (cfg->fc_flags & RTF_GATEWAY)
@@ -2331,7 +2924,7 @@ static int ip6_route_del(struct fib6_config *cfg,
return __ip6_del_rt_siblings(rt, cfg);
}
}
- read_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
return err;
}
@@ -2435,8 +3028,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
nrt->rt6i_protocol = RTPROT_REDIRECT;
nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
- if (ip6_ins_rt(nrt))
- goto out_release;
+ /* No need to remove rt from the exception table if rt is
+ * a cached route because rt6_insert_exception() will
+ * takes care of it
+ */
+ if (rt6_insert_exception(nrt, rt)) {
+ dst_release_immediate(&nrt->dst);
+ goto out;
+ }
netevent.old = &rt->dst;
netevent.new = &nrt->dst;
@@ -2444,17 +3043,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
netevent.neigh = neigh;
call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
- if (rt->rt6i_flags & RTF_CACHE) {
- rt = (struct rt6_info *) dst_clone(&rt->dst);
- ip6_del_rt(rt);
- }
-
-out_release:
- /* Release the reference taken in
- * ip6_rt_cache_alloc()
- */
- dst_release(&nrt->dst);
-
out:
neigh_release(neigh);
}
@@ -2511,23 +3099,23 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
if (!table)
return NULL;
- read_lock_bh(&table->tb6_lock);
- fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
+ rcu_read_lock();
+ fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
if (!fn)
goto out;
- for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ for_each_fib6_node_rt_rcu(fn) {
if (rt->dst.dev->ifindex != ifindex)
continue;
if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
continue;
if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
continue;
- dst_hold(&rt->dst);
+ ip6_hold_safe(NULL, &rt, false);
break;
}
out:
- read_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
return rt;
}
@@ -2573,16 +3161,16 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
if (!table)
return NULL;
- read_lock_bh(&table->tb6_lock);
- for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+ rcu_read_lock();
+ for_each_fib6_node_rt_rcu(&table->tb6_root) {
if (dev == rt->dst.dev &&
((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
ipv6_addr_equal(&rt->rt6i_gateway, addr))
break;
}
if (rt)
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
+ ip6_hold_safe(NULL, &rt, false);
+ rcu_read_unlock();
return rt;
}
@@ -2620,17 +3208,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table)
struct rt6_info *rt;
restart:
- read_lock_bh(&table->tb6_lock);
- for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+ rcu_read_lock();
+ for_each_fib6_node_rt_rcu(&table->tb6_root) {
if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
(!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- ip6_del_rt(rt);
+ if (dst_hold_safe(&rt->dst)) {
+ rcu_read_unlock();
+ ip6_del_rt(rt);
+ } else {
+ rcu_read_unlock();
+ }
goto restart;
}
}
- read_unlock_bh(&table->tb6_lock);
+ rcu_read_unlock();
table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
@@ -2818,8 +3409,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
if (((void *)rt->dst.dev == dev || !dev) &&
rt != net->ipv6.ip6_null_entry &&
ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+ spin_lock_bh(&rt6_exception_lock);
/* remove prefsrc entry */
rt->rt6i_prefsrc.plen = 0;
+ /* need to update cache as well */
+ rt6_exceptions_remove_prefsrc(rt);
+ spin_unlock_bh(&rt6_exception_lock);
}
return 0;
}
@@ -2836,18 +3431,23 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
}
#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
-#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
- if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
- ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
- ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
+ if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+ ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
return -1;
}
+
+ /* Further clean up cached routes in exception table.
+ * This is needed because cached route may have a different
+ * gateway than its 'parent' in the case of an ip redirect.
+ */
+ rt6_exceptions_clean_tohost(rt, gateway);
+
return 0;
}
@@ -2926,19 +3526,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
if (rt->dst.dev == arg->dev &&
dst_metric_raw(&rt->dst, RTAX_MTU) &&
!dst_metric_locked(&rt->dst, RTAX_MTU)) {
- if (rt->rt6i_flags & RTF_CACHE) {
- /* For RTF_CACHE with rt6i_pmtu == 0
- * (i.e. a redirected route),
- * the metrics of its rt->dst.from has already
- * been updated.
- */
- if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
- rt->rt6i_pmtu = arg->mtu;
- } else if (dst_mtu(&rt->dst) >= arg->mtu ||
- (dst_mtu(&rt->dst) < arg->mtu &&
- dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+ spin_lock_bh(&rt6_exception_lock);
+ if (dst_mtu(&rt->dst) >= arg->mtu ||
+ (dst_mtu(&rt->dst) < arg->mtu &&
+ dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
}
+ rt6_exceptions_update_pmtu(rt, arg->mtu);
+ spin_unlock_bh(&rt6_exception_lock);
}
return 0;
}
@@ -3839,7 +4434,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
net->ipv6.rt6_stats->fib_nodes,
net->ipv6.rt6_stats->fib_route_nodes,
- net->ipv6.rt6_stats->fib_rt_alloc,
+ atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
net->ipv6.rt6_stats->fib_rt_entries,
net->ipv6.rt6_stats->fib_rt_cache,
dst_entries_get_slow(&net->ipv6.ip6_dst_ops),