From 83723d60717f8da0f53f91cf42a845ed56c09662 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Jan 2011 20:11:38 +0100 Subject: netfilter: x_tables: dont block BH while reading counters Using "iptables -L" with a lot of rules have a too big BH latency. Jesper mentioned ~6 ms and worried of frame drops. Switch to a per_cpu seqlock scheme, so that taking a snapshot of counters doesnt need to block BH (for this cpu, but also other cpus). This adds two increments on seqlock sequence per ipt_do_table() call, its a reasonable cost for allowing "iptables -L" not block BH processing. Reported-by: Jesper Dangaard Brouer Signed-off-by: Eric Dumazet CC: Patrick McHardy Acked-by: Stephen Hemminger Acked-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 45 +++++++++++++---------------------------- net/ipv4/netfilter/ip_tables.c | 45 +++++++++++++---------------------------- net/ipv6/netfilter/ip6_tables.c | 45 +++++++++++++---------------------------- net/netfilter/x_tables.c | 3 ++- 4 files changed, 44 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3fac340a28d5..e855fffaed95 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -710,42 +710,25 @@ static void get_counters(const struct xt_table_info *t, struct arpt_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu = get_cpu(); - - /* Instead of clearing (by a previous call to memset()) - * the counters and using adds, we set the counters - * with data used by 'current' CPU - * - * Bottom half has to be disabled to prevent deadlock - * if new softirq were to run and call ipt_do_table - */ - local_bh_disable(); - i = 0; - xt_entry_foreach(iter, t->entries[curcpu], t->size) { - SET_COUNTER(counters[i], iter->counters.bcnt, - iter->counters.pcnt); - ++i; - } - local_bh_enable(); - /* Processing counters from other cpus, we can let bottom half enabled, - * (preemption is disabled) - */ for_each_possible_cpu(cpu) { - if (cpu == curcpu) - continue; + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; + i = 0; - local_bh_disable(); - xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { - ADD_COUNTER(counters[i], iter->counters.bcnt, - iter->counters.pcnt); + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqbegin(lock); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqretry(lock, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); ++i; } - xt_info_wrunlock(cpu); - local_bh_enable(); } - put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) @@ -759,7 +742,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) * about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc(countersize); + counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1007,7 +990,7 @@ static int __do_replace(struct net *net, const char *name, struct arpt_entry *iter; ret = 0; - counters = vmalloc(num_counters * sizeof(struct xt_counters)); + counters = vzalloc(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a846d633b3b6..652efea013dc 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -884,42 +884,25 @@ get_counters(const struct xt_table_info *t, struct ipt_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu = get_cpu(); - - /* Instead of clearing (by a previous call to memset()) - * the counters and using adds, we set the counters - * with data used by 'current' CPU. - * - * Bottom half has to be disabled to prevent deadlock - * if new softirq were to run and call ipt_do_table - */ - local_bh_disable(); - i = 0; - xt_entry_foreach(iter, t->entries[curcpu], t->size) { - SET_COUNTER(counters[i], iter->counters.bcnt, - iter->counters.pcnt); - ++i; - } - local_bh_enable(); - /* Processing counters from other cpus, we can let bottom half enabled, - * (preemption is disabled) - */ for_each_possible_cpu(cpu) { - if (cpu == curcpu) - continue; + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; + i = 0; - local_bh_disable(); - xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { - ADD_COUNTER(counters[i], iter->counters.bcnt, - iter->counters.pcnt); + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqbegin(lock); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqretry(lock, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ } - xt_info_wrunlock(cpu); - local_bh_enable(); } - put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) @@ -932,7 +915,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc(countersize); + counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct ipt_entry *iter; ret = 0; - counters = vmalloc(num_counters * sizeof(struct xt_counters)); + counters = vzalloc(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 455582384ece..7d227c644f72 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -897,42 +897,25 @@ get_counters(const struct xt_table_info *t, struct ip6t_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu = get_cpu(); - - /* Instead of clearing (by a previous call to memset()) - * the counters and using adds, we set the counters - * with data used by 'current' CPU - * - * Bottom half has to be disabled to prevent deadlock - * if new softirq were to run and call ipt_do_table - */ - local_bh_disable(); - i = 0; - xt_entry_foreach(iter, t->entries[curcpu], t->size) { - SET_COUNTER(counters[i], iter->counters.bcnt, - iter->counters.pcnt); - ++i; - } - local_bh_enable(); - /* Processing counters from other cpus, we can let bottom half enabled, - * (preemption is disabled) - */ for_each_possible_cpu(cpu) { - if (cpu == curcpu) - continue; + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; + i = 0; - local_bh_disable(); - xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { - ADD_COUNTER(counters[i], iter->counters.bcnt, - iter->counters.pcnt); + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqbegin(lock); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqretry(lock, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); ++i; } - xt_info_wrunlock(cpu); - local_bh_enable(); } - put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) @@ -945,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc(countersize); + counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1216,7 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct ip6t_entry *iter; ret = 0; - counters = vmalloc(num_counters * sizeof(struct xt_counters)); + counters = vzalloc(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 80463507420e..c94237631077 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1325,7 +1325,8 @@ static int __init xt_init(void) for_each_possible_cpu(i) { struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); - spin_lock_init(&lock->lock); + + seqlock_init(&lock->lock); lock->readers = 0; } -- cgit v1.2.3 From 13ee6ac579574a2a95e982b19920fd2495dce8cd Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 11 Jan 2011 23:54:42 +0100 Subject: netfilter: fix race in conntrack between dump_table and destroy The netlink interface to dump the connection tracking table has a race when entries are deleted at the same time. A customer reported a crash and the backtrace showed thatctnetlink_dump_table was running while a conntrack entry was being destroyed. (see https://bugzilla.vyatta.com/show_bug.cgi?id=6402). According to RCU documentation, when using hlist_nulls the reader must handle the case of seeing a deleted entry and not proceed further down the linked list. The old code would continue which caused the scan to walk into the free list. This patch uses locking (rather than RCU) for this operation which is guaranteed safe, and no longer requires getting reference while doing dump operation. Signed-off-by: Stephen Hemminger Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 746140264b2d..5cb8d3027b18 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -645,25 +645,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; - rcu_read_lock(); + spin_lock_bh(&nf_conntrack_lock); last = (struct nf_conn *)cb->args[1]; for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { restart: - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[cb->args[0]], + hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); - if (!atomic_inc_not_zero(&ct->ct_general.use)) - continue; /* Dump entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then dump everything. */ if (l3proto && nf_ct_l3num(ct) != l3proto) - goto releasect; + continue; if (cb->args[1]) { if (ct != last) - goto releasect; + continue; cb->args[1] = 0; } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, @@ -681,8 +679,6 @@ restart: if (acct) memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX])); } -releasect: - nf_ct_put(ct); } if (cb->args[1]) { cb->args[1] = 0; @@ -690,7 +686,7 @@ releasect: } } out: - rcu_read_unlock(); + spin_unlock_bh(&nf_conntrack_lock); if (last) nf_ct_put(last); -- cgit v1.2.3