summaryrefslogtreecommitdiff
path: root/net/openvswitch/flow_table.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-08-06 06:13:21 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2020-08-06 06:13:21 +0300
commit47ec5303d73ea344e84f46660fff693c57641386 (patch)
treea2252debab749de29620c43285295d60c4741119 /net/openvswitch/flow_table.c
parent8186749621ed6b8fc42644c399e8c755a2b6f630 (diff)
parentc1055b76ad00aed0e8b79417080f212d736246b6 (diff)
downloadlinux-47ec5303d73ea344e84f46660fff693c57641386.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller: 1) Support 6Ghz band in ath11k driver, from Rajkumar Manoharan. 2) Support UDP segmentation in code TSO code, from Eric Dumazet. 3) Allow flashing different flash images in cxgb4 driver, from Vishal Kulkarni. 4) Add drop frames counter and flow status to tc flower offloading, from Po Liu. 5) Support n-tuple filters in cxgb4, from Vishal Kulkarni. 6) Various new indirect call avoidance, from Eric Dumazet and Brian Vazquez. 7) Fix BPF verifier failures on 32-bit pointer arithmetic, from Yonghong Song. 8) Support querying and setting hardware address of a port function via devlink, use this in mlx5, from Parav Pandit. 9) Support hw ipsec offload on bonding slaves, from Jarod Wilson. 10) Switch qca8k driver over to phylink, from Jonathan McDowell. 11) In bpftool, show list of processes holding BPF FD references to maps, programs, links, and btf objects. From Andrii Nakryiko. 12) Several conversions over to generic power management, from Vaibhav Gupta. 13) Add support for SO_KEEPALIVE et al. to bpf_setsockopt(), from Dmitry Yakunin. 14) Various https url conversions, from Alexander A. Klimov. 15) Timestamping and PHC support for mscc PHY driver, from Antoine Tenart. 16) Support bpf iterating over tcp and udp sockets, from Yonghong Song. 17) Support 5GBASE-T i40e NICs, from Aleksandr Loktionov. 18) Add kTLS RX HW offload support to mlx5e, from Tariq Toukan. 19) Fix the ->ndo_start_xmit() return type to be netdev_tx_t in several drivers. From Luc Van Oostenryck. 20) XDP support for xen-netfront, from Denis Kirjanov. 21) Support receive buffer autotuning in MPTCP, from Florian Westphal. 22) Support EF100 chip in sfc driver, from Edward Cree. 23) Add XDP support to mvpp2 driver, from Matteo Croce. 24) Support MPTCP in sock_diag, from Paolo Abeni. 25) Commonize UDP tunnel offloading code by creating udp_tunnel_nic infrastructure, from Jakub Kicinski. 26) Several pci_ --> dma_ API conversions, from Christophe JAILLET. 27) Add FLOW_ACTION_POLICE support to mlxsw, from Ido Schimmel. 28) Add SK_LOOKUP bpf program type, from Jakub Sitnicki. 29) Refactor a lot of networking socket option handling code in order to avoid set_fs() calls, from Christoph Hellwig. 30) Add rfc4884 support to icmp code, from Willem de Bruijn. 31) Support TBF offload in dpaa2-eth driver, from Ioana Ciornei. 32) Support XDP_REDIRECT in qede driver, from Alexander Lobakin. 33) Support PCI relaxed ordering in mlx5 driver, from Aya Levin. 34) Support TCP syncookies in MPTCP, from Flowian Westphal. 35) Fix several tricky cases of PMTU handling wrt. briding, from Stefano Brivio. * git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2056 commits) net: thunderx: initialize VF's mailbox mutex before first usage usb: hso: remove bogus check for EINPROGRESS usb: hso: no complaint about kmalloc failure hso: fix bailout in error case of probe ip_tunnel_core: Fix build for archs without _HAVE_ARCH_IPV6_CSUM selftests/net: relax cpu affinity requirement in msg_zerocopy test mptcp: be careful on subflow creation selftests: rtnetlink: make kci_test_encap() return sub-test result selftests: rtnetlink: correct the final return value for the test net: dsa: sja1105: use detected device id instead of DT one on mismatch tipc: set ub->ifindex for local ipv6 address ipv6: add ipv6_dev_find() net: openvswitch: silence suspicious RCU usage warning Revert "vxlan: fix tos value before xmit" ptp: only allow phase values lower than 1 period farsync: switch from 'pci_' to 'dma_' API wan: wanxl: switch from 'pci_' to 'dma_' API hv_netvsc: do not use VF device if link is down dpaa2-eth: Fix passing zero to 'PTR_ERR' warning net: macb: Properly handle phylink on at91sam9x ...
Diffstat (limited to 'net/openvswitch/flow_table.c')
-rw-r--r--net/openvswitch/flow_table.c287
1 files changed, 267 insertions, 20 deletions
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 2398d7238300..8c12675cbb67 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -29,6 +29,7 @@
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
+#include <linux/sort.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
@@ -37,8 +38,8 @@
#define MASK_ARRAY_SIZE_MIN 16
#define REHASH_INTERVAL (10 * 60 * HZ)
+#define MC_DEFAULT_HASH_ENTRIES 256
#define MC_HASH_SHIFT 8
-#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT)
#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
static struct kmem_cache *flow_cache;
@@ -169,16 +170,70 @@ static struct table_instance *table_instance_alloc(int new_size)
return ti;
}
+static void __mask_array_destroy(struct mask_array *ma)
+{
+ free_percpu(ma->masks_usage_cntr);
+ kfree(ma);
+}
+
+static void mask_array_rcu_cb(struct rcu_head *rcu)
+{
+ struct mask_array *ma = container_of(rcu, struct mask_array, rcu);
+
+ __mask_array_destroy(ma);
+}
+
+static void tbl_mask_array_reset_counters(struct mask_array *ma)
+{
+ int i, cpu;
+
+ /* As the per CPU counters are not atomic we can not go ahead and
+ * reset them from another CPU. To be able to still have an approximate
+ * zero based counter we store the value at reset, and subtract it
+ * later when processing.
+ */
+ for (i = 0; i < ma->max; i++) {
+ ma->masks_usage_zero_cntr[i] = 0;
+
+ for_each_possible_cpu(cpu) {
+ u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
+ cpu);
+ unsigned int start;
+ u64 counter;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&ma->syncp);
+ counter = usage_counters[i];
+ } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+
+ ma->masks_usage_zero_cntr[i] += counter;
+ }
+ }
+}
+
static struct mask_array *tbl_mask_array_alloc(int size)
{
struct mask_array *new;
size = max(MASK_ARRAY_SIZE_MIN, size);
new = kzalloc(sizeof(struct mask_array) +
- sizeof(struct sw_flow_mask *) * size, GFP_KERNEL);
+ sizeof(struct sw_flow_mask *) * size +
+ sizeof(u64) * size, GFP_KERNEL);
if (!new)
return NULL;
+ new->masks_usage_zero_cntr = (u64 *)((u8 *)new +
+ sizeof(struct mask_array) +
+ sizeof(struct sw_flow_mask *) *
+ size);
+
+ new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
+ __alignof__(u64));
+ if (!new->masks_usage_cntr) {
+ kfree(new);
+ return NULL;
+ }
+
new->count = 0;
new->max = size;
@@ -202,10 +257,10 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
if (ovsl_dereference(old->masks[i]))
new->masks[new->count++] = old->masks[i];
}
+ call_rcu(&old->rcu, mask_array_rcu_cb);
}
rcu_assign_pointer(tbl->mask_array, new);
- kfree_rcu(old, rcu);
return 0;
}
@@ -223,6 +278,11 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
return err;
ma = ovsl_dereference(tbl->mask_array);
+ } else {
+ /* On every add or delete we need to reset the counters so
+ * every new mask gets a fair chance of being prioritized.
+ */
+ tbl_mask_array_reset_counters(ma);
}
BUG_ON(ovsl_dereference(ma->masks[ma_count]));
@@ -260,6 +320,9 @@ found:
if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
ma_count <= (ma->max / 3))
tbl_mask_array_realloc(tbl, ma->max / 2);
+ else
+ tbl_mask_array_reset_counters(ma);
+
}
/* Remove 'mask' from the mask list, if it is not needed any more. */
@@ -278,15 +341,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
}
}
+static void __mask_cache_destroy(struct mask_cache *mc)
+{
+ free_percpu(mc->mask_cache);
+ kfree(mc);
+}
+
+static void mask_cache_rcu_cb(struct rcu_head *rcu)
+{
+ struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu);
+
+ __mask_cache_destroy(mc);
+}
+
+static struct mask_cache *tbl_mask_cache_alloc(u32 size)
+{
+ struct mask_cache_entry __percpu *cache = NULL;
+ struct mask_cache *new;
+
+ /* Only allow size to be 0, or a power of 2, and does not exceed
+ * percpu allocation size.
+ */
+ if ((!is_power_of_2(size) && size != 0) ||
+ (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
+ return NULL;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->cache_size = size;
+ if (new->cache_size > 0) {
+ cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry),
+ new->cache_size),
+ __alignof__(struct mask_cache_entry));
+ if (!cache) {
+ kfree(new);
+ return NULL;
+ }
+ }
+
+ new->mask_cache = cache;
+ return new;
+}
+int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
+{
+ struct mask_cache *mc = rcu_dereference(table->mask_cache);
+ struct mask_cache *new;
+
+ if (size == mc->cache_size)
+ return 0;
+
+ if ((!is_power_of_2(size) && size != 0) ||
+ (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
+ return -EINVAL;
+
+ new = tbl_mask_cache_alloc(size);
+ if (!new)
+ return -ENOMEM;
+
+ rcu_assign_pointer(table->mask_cache, new);
+ call_rcu(&mc->rcu, mask_cache_rcu_cb);
+
+ return 0;
+}
+
int ovs_flow_tbl_init(struct flow_table *table)
{
struct table_instance *ti, *ufid_ti;
+ struct mask_cache *mc;
struct mask_array *ma;
- table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) *
- MC_HASH_ENTRIES,
- __alignof__(struct mask_cache_entry));
- if (!table->mask_cache)
+ mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
+ if (!mc)
return -ENOMEM;
ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
@@ -304,6 +431,7 @@ int ovs_flow_tbl_init(struct flow_table *table)
rcu_assign_pointer(table->ti, ti);
rcu_assign_pointer(table->ufid_ti, ufid_ti);
rcu_assign_pointer(table->mask_array, ma);
+ rcu_assign_pointer(table->mask_cache, mc);
table->last_rehash = jiffies;
table->count = 0;
table->ufid_count = 0;
@@ -312,9 +440,9 @@ int ovs_flow_tbl_init(struct flow_table *table)
free_ti:
__table_instance_destroy(ti);
free_mask_array:
- kfree(ma);
+ __mask_array_destroy(ma);
free_mask_cache:
- free_percpu(table->mask_cache);
+ __mask_cache_destroy(mc);
return -ENOMEM;
}
@@ -390,9 +518,11 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
{
struct table_instance *ti = rcu_dereference_raw(table->ti);
struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
+ struct mask_cache *mc = rcu_dereference_raw(table->mask_cache);
+ struct mask_array *ma = rcu_dereference_raw(table->mask_array);
- free_percpu(table->mask_cache);
- kfree_rcu(rcu_dereference_raw(table->mask_array), rcu);
+ call_rcu(&mc->rcu, mask_cache_rcu_cb);
+ call_rcu(&ma->rcu, mask_array_rcu_cb);
table_instance_destroy(table, ti, ufid_ti, false);
}
@@ -604,8 +734,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
struct mask_array *ma,
const struct sw_flow_key *key,
u32 *n_mask_hit,
+ u32 *n_cache_hit,
u32 *index)
{
+ u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
struct sw_flow *flow;
struct sw_flow_mask *mask;
int i;
@@ -614,8 +746,13 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
mask = rcu_dereference_ovsl(ma->masks[*index]);
if (mask) {
flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
- if (flow)
+ if (flow) {
+ u64_stats_update_begin(&ma->syncp);
+ usage_counters[*index]++;
+ u64_stats_update_end(&ma->syncp);
+ (*n_cache_hit)++;
return flow;
+ }
}
}
@@ -631,6 +768,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
if (flow) { /* Found */
*index = i;
+ u64_stats_update_begin(&ma->syncp);
+ usage_counters[*index]++;
+ u64_stats_update_end(&ma->syncp);
return flow;
}
}
@@ -648,8 +788,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
const struct sw_flow_key *key,
u32 skb_hash,
- u32 *n_mask_hit)
+ u32 *n_mask_hit,
+ u32 *n_cache_hit)
{
+ struct mask_cache *mc = rcu_dereference(tbl->mask_cache);
struct mask_array *ma = rcu_dereference(tbl->mask_array);
struct table_instance *ti = rcu_dereference(tbl->ti);
struct mask_cache_entry *entries, *ce;
@@ -658,10 +800,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
int seg;
*n_mask_hit = 0;
- if (unlikely(!skb_hash)) {
+ *n_cache_hit = 0;
+ if (unlikely(!skb_hash || mc->cache_size == 0)) {
u32 mask_index = 0;
+ u32 cache = 0;
- return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);
+ return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache,
+ &mask_index);
}
/* Pre and post recirulation flows usually have the same skb_hash
@@ -672,17 +817,17 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
ce = NULL;
hash = skb_hash;
- entries = this_cpu_ptr(tbl->mask_cache);
+ entries = this_cpu_ptr(mc->mask_cache);
/* Find the cache entry 'ce' to operate on. */
for (seg = 0; seg < MC_HASH_SEGS; seg++) {
- int index = hash & (MC_HASH_ENTRIES - 1);
+ int index = hash & (mc->cache_size - 1);
struct mask_cache_entry *e;
e = &entries[index];
if (e->skb_hash == skb_hash) {
flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
- &e->mask_index);
+ n_cache_hit, &e->mask_index);
if (!flow)
e->skb_hash = 0;
return flow;
@@ -695,10 +840,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
}
/* Cache miss, do full lookup. */
- flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);
+ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit,
+ &ce->mask_index);
if (flow)
ce->skb_hash = skb_hash;
+ *n_cache_hit = 0;
return flow;
}
@@ -708,9 +855,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
u32 __always_unused n_mask_hit;
+ u32 __always_unused n_cache_hit;
u32 index = 0;
- return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index);
+ return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
}
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
@@ -787,6 +935,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table)
return READ_ONCE(ma->count);
}
+u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table)
+{
+ struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
+
+ return READ_ONCE(mc->cache_size);
+}
+
static struct table_instance *table_instance_expand(struct table_instance *ti,
bool ufid)
{
@@ -934,6 +1089,98 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
return 0;
}
+static int compare_mask_and_count(const void *a, const void *b)
+{
+ const struct mask_count *mc_a = a;
+ const struct mask_count *mc_b = b;
+
+ return (s64)mc_b->counter - (s64)mc_a->counter;
+}
+
+/* Must be called with OVS mutex held. */
+void ovs_flow_masks_rebalance(struct flow_table *table)
+{
+ struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+ struct mask_count *masks_and_count;
+ struct mask_array *new;
+ int masks_entries = 0;
+ int i;
+
+ /* Build array of all current entries with use counters. */
+ masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count),
+ GFP_KERNEL);
+ if (!masks_and_count)
+ return;
+
+ for (i = 0; i < ma->max; i++) {
+ struct sw_flow_mask *mask;
+ unsigned int start;
+ int cpu;
+
+ mask = rcu_dereference_ovsl(ma->masks[i]);
+ if (unlikely(!mask))
+ break;
+
+ masks_and_count[i].index = i;
+ masks_and_count[i].counter = 0;
+
+ for_each_possible_cpu(cpu) {
+ u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
+ cpu);
+ u64 counter;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&ma->syncp);
+ counter = usage_counters[i];
+ } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+
+ masks_and_count[i].counter += counter;
+ }
+
+ /* Subtract the zero count value. */
+ masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i];
+
+ /* Rather than calling tbl_mask_array_reset_counters()
+ * below when no change is needed, do it inline here.
+ */
+ ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter;
+ }
+
+ if (i == 0)
+ goto free_mask_entries;
+
+ /* Sort the entries */
+ masks_entries = i;
+ sort(masks_and_count, masks_entries, sizeof(*masks_and_count),
+ compare_mask_and_count, NULL);
+
+ /* If the order is the same, nothing to do... */
+ for (i = 0; i < masks_entries; i++) {
+ if (i != masks_and_count[i].index)
+ break;
+ }
+ if (i == masks_entries)
+ goto free_mask_entries;
+
+ /* Rebuilt the new list in order of usage. */
+ new = tbl_mask_array_alloc(ma->max);
+ if (!new)
+ goto free_mask_entries;
+
+ for (i = 0; i < masks_entries; i++) {
+ int index = masks_and_count[i].index;
+
+ if (ovsl_dereference(ma->masks[index]))
+ new->masks[new->count++] = ma->masks[index];
+ }
+
+ rcu_assign_pointer(table->mask_array, new);
+ call_rcu(&ma->rcu, mask_array_rcu_cb);
+
+free_mask_entries:
+ kfree(masks_and_count);
+}
+
/* Initializes the flow module.
* Returns zero if successful or a negative error code. */
int ovs_flow_init(void)