diff options
Diffstat (limited to 'net/netfilter')
31 files changed, 543 insertions, 201 deletions
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 9a6b64779e64..0b68e2e2824e 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -739,9 +739,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 271da8447b29..2a3017b9c001 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -44,7 +44,8 @@ config IP_VS_DEBUG config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" - range 8 20 + range 8 20 if !64BIT + range 8 27 if 64BIT default 12 help The IPVS connection hash table uses the chaining scheme to handle @@ -54,24 +55,24 @@ config IP_VS_TAB_BITS Note the table size must be power of 2. The table size will be the value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). + from 8 to 27 for 64BIT(20 otherwise), the default number is 12, + which means the table size is 4096. Don't input the number too + small, otherwise you will lose performance on it. You can adapt the + table size yourself, according to your virtual server application. + It is good to set the table size not far less than the number of + connections per second multiplying average lasting time of + connection in the table. For example, your virtual server gets 200 + connections per second, the connection lasts for 200 seconds in + average in the connection table, the table size should be not far + less than 200x200, it is good to set the table size 32768 (2**15). Another note that each connection occupies 128 bytes effectively and each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. You can overwrite this number setting conn_tab_bits module parameter - or by appending ip_vs.conn_tab_bits=? to the kernel command line - if IP VS was compiled built-in. + or by appending ip_vs.conn_tab_bits=? to the kernel command line if + IP VS was compiled built-in. comment "IPVS transport protocol load balancing support" diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 928e64653837..9065da3cdd12 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -26,7 +26,6 @@ #include <linux/net.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/vmalloc.h> #include <linux/proc_fs.h> /* for proc_net_* */ #include <linux/slab.h> #include <linux/seq_file.h> @@ -1482,13 +1481,21 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_conn_init(void) { size_t tab_array_size; + int max_avail; +#if BITS_PER_LONG > 32 + int max = 27; +#else + int max = 20; +#endif + int min = 8; int idx; - /* Compute size and mask */ - if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { - pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); - ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; - } + max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; + max_avail -= 2; /* ~4 in hash row */ + max_avail -= 1; /* IPVS up to 1/2 of mem */ + max_avail -= order_base_2(sizeof(struct ip_vs_conn)); + max = clamp(max, min, max_avail); + ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; @@ -1497,7 +1504,8 @@ int __init ip_vs_conn_init(void) */ tab_array_size = array_size(ip_vs_conn_tab_size, sizeof(*ip_vs_conn_tab)); - ip_vs_conn_tab = vmalloc(tab_array_size); + ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab), GFP_KERNEL); if (!ip_vs_conn_tab) return -ENOMEM; @@ -1506,7 +1514,7 @@ int __init ip_vs_conn_init(void) sizeof(struct ip_vs_conn), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); return -ENOMEM; } @@ -1534,5 +1542,5 @@ void ip_vs_conn_cleanup(void) rcu_barrier(); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - vfree(ip_vs_conn_tab); + kvfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index a80b960223e1..9193e109e6b3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -139,7 +139,7 @@ retry: if (PTR_ERR(rt) == -EINVAL && *saddr && rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { *saddr = 0; - flowi4_update_output(&fl4, 0, 0, daddr, 0); + flowi4_update_output(&fl4, 0, daddr, 0); goto retry; } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); @@ -147,7 +147,7 @@ retry: } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); *saddr = fl4.saddr; - flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + flowi4_update_output(&fl4, 0, daddr, fl4.saddr); loop = true; goto retry; } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index c1557d47ccd1..d4fd626d2b8c 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -432,9 +432,19 @@ static bool dccp_error(const struct dccp_hdr *dh, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { + static const unsigned long require_seq48 = 1 << DCCP_PKT_REQUEST | + 1 << DCCP_PKT_RESPONSE | + 1 << DCCP_PKT_CLOSEREQ | + 1 << DCCP_PKT_CLOSE | + 1 << DCCP_PKT_RESET | + 1 << DCCP_PKT_SYNC | + 1 << DCCP_PKT_SYNCACK; unsigned int dccp_len = skb->len - dataoff; unsigned int cscov; const char *msg; + u8 type; + + BUILD_BUG_ON(DCCP_PKT_INVALID >= BITS_PER_LONG); if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || dh->dccph_doff * 4 > dccp_len) { @@ -459,34 +469,70 @@ static bool dccp_error(const struct dccp_hdr *dh, goto out_invalid; } - if (dh->dccph_type >= DCCP_PKT_INVALID) { + type = dh->dccph_type; + if (type >= DCCP_PKT_INVALID) { msg = "nf_ct_dccp: reserved packet type "; goto out_invalid; } + + if (test_bit(type, &require_seq48) && !dh->dccph_x) { + msg = "nf_ct_dccp: type lacks 48bit sequence numbers"; + goto out_invalid; + } + return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); return true; } +struct nf_conntrack_dccp_buf { + struct dccp_hdr dh; /* generic header part */ + struct dccp_hdr_ext ext; /* optional depending dh->dccph_x */ + union { /* depends on header type */ + struct dccp_hdr_ack_bits ack; + struct dccp_hdr_request req; + struct dccp_hdr_response response; + struct dccp_hdr_reset rst; + } u; +}; + +static struct dccp_hdr * +dccp_header_pointer(const struct sk_buff *skb, int offset, const struct dccp_hdr *dh, + struct nf_conntrack_dccp_buf *buf) +{ + unsigned int hdrlen = __dccp_hdr_len(dh); + + if (hdrlen > sizeof(*buf)) + return NULL; + + return skb_header_pointer(skb, offset, hdrlen, buf); +} + int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct dccp_hdr _dh, *dh; + struct nf_conntrack_dccp_buf _dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; unsigned int *timeouts; + struct dccp_hdr *dh; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); if (!dh) return NF_DROP; if (dccp_error(dh, skb, dataoff, state)) return -NF_ACCEPT; + /* pull again, including possible 48 bit sequences and subtype header */ + dh = dccp_header_pointer(skb, dataoff, dh, &_dh); + if (!dh) + return NF_DROP; + type = dh->dccph_type; if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 728eeb0aea87..ad6f0ca40cd2 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -296,6 +296,7 @@ void nf_conntrack_gre_init_net(struct net *net) /* protocol helper struct */ const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = { .l4proto = IPPROTO_GRE, + .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 77f5e82d8e3f..d0eac27f6ba0 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -611,7 +611,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, start += strlen(name); *val = simple_strtoul(start, &end, 0); if (start == end) - return 0; + return -1; if (matchoff && matchlen) { *matchoff = start - dptr; *matchlen = end - start; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index b0ef48b21dcb..1d34d700bd09 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -125,9 +125,6 @@ static int flow_offload_fill_route(struct flow_offload *flow, break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: - if (!dst_hold_safe(route->tuple[dir].dst)) - return -1; - flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; @@ -148,27 +145,12 @@ static void nft_flow_dst_release(struct flow_offload *flow, dst_release(flow->tuplehash[dir].tuple.dst_cache); } -int flow_offload_route_init(struct flow_offload *flow, +void flow_offload_route_init(struct flow_offload *flow, const struct nf_flow_route *route) { - int err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); - if (err < 0) - return err; - - err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); - if (err < 0) - goto err_route_reply; - + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; - - return 0; - -err_route_reply: - nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); - - return err; } EXPORT_SYMBOL_GPL(flow_offload_route_init); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 3bbaf9c7ea46..e45fade76409 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -8,6 +8,7 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_ether.h> +#include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -163,38 +164,43 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, } } -static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize, - u32 offset) +struct nf_flowtable_ctx { + const struct net_device *in; + u32 offset; + u32 hdrsize; +}; + +static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, + struct flow_offload_tuple *tuple) { struct flow_ports *ports; unsigned int thoff; struct iphdr *iph; u8 ipproto; - if (!pskb_may_pull(skb, sizeof(*iph) + offset)) + if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset)) return -1; - iph = (struct iphdr *)(skb_network_header(skb) + offset); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4); if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff))) return -1; - thoff += offset; + thoff += ctx->offset; ipproto = iph->protocol; switch (ipproto) { case IPPROTO_TCP: - *hdrsize = sizeof(struct tcphdr); + ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - *hdrsize = sizeof(struct udphdr); + ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: - *hdrsize = sizeof(struct gre_base_hdr); + ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: @@ -204,7 +210,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (iph->ttl <= 1) return -1; - if (!pskb_may_pull(skb, thoff + *hdrsize)) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; switch (ipproto) { @@ -224,13 +230,13 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, } } - iph = (struct iphdr *)(skb_network_header(skb) + offset); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v4.s_addr = iph->saddr; tuple->dst_v4.s_addr = iph->daddr; tuple->l3proto = AF_INET; tuple->l4proto = ipproto; - tuple->iifidx = dev->ifindex; + tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; @@ -336,58 +342,56 @@ static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, return NF_STOLEN; } -unsigned int -nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static struct flow_offload_tuple_rhash * +nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, struct sk_buff *skb) { - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; struct flow_offload_tuple tuple = {}; - enum flow_offload_tuple_dir dir; - struct flow_offload *flow; - struct net_device *outdev; - u32 hdrsize, offset = 0; - unsigned int thoff, mtu; - struct rtable *rt; - struct iphdr *iph; - __be32 nexthop; - int ret; if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset)) - return NF_ACCEPT; + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + return NULL; - if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0) - return NF_ACCEPT; + if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) + return NULL; - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; + return flow_offload_lookup(flow_table, &tuple); +} + +static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) +{ + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + unsigned int thoff, mtu; + struct iphdr *iph; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - mtu = flow->tuplehash[dir].tuple.mtu + offset; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) - return NF_ACCEPT; + return 0; - iph = (struct iphdr *)(skb_network_header(skb) + offset); - thoff = (iph->ihl * 4) + offset; + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); + thoff = (iph->ihl * 4) + ctx->offset; if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) - return NF_ACCEPT; + return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (skb_try_make_writable(skb, thoff + hdrsize)) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; flow_offload_refresh(flow_table, flow, false); nf_flow_encap_pop(skb, tuplehash); - thoff -= offset; + thoff -= ctx->offset; iph = ip_hdr(skb); nf_flow_nat_ip(flow, skb, thoff, dir, iph); @@ -398,6 +402,35 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + return 1; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + struct flow_offload *flow; + struct net_device *outdev; + struct rtable *rt; + __be32 nexthop; + int ret; + + tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); + if (!tuplehash) + return NF_ACCEPT; + + ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { rt = (struct rtable *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet_skb_parm)); @@ -406,6 +439,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = (struct rtable *)tuplehash->tuple.dst_cache; @@ -535,32 +571,31 @@ static void nf_flow_nat_ipv6(const struct flow_offload *flow, } } -static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize, - u32 offset) +static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, + struct flow_offload_tuple *tuple) { struct flow_ports *ports; struct ipv6hdr *ip6h; unsigned int thoff; u8 nexthdr; - thoff = sizeof(*ip6h) + offset; + thoff = sizeof(*ip6h) + ctx->offset; if (!pskb_may_pull(skb, thoff)) return -1; - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); nexthdr = ip6h->nexthdr; switch (nexthdr) { case IPPROTO_TCP: - *hdrsize = sizeof(struct tcphdr); + ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - *hdrsize = sizeof(struct udphdr); + ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: - *hdrsize = sizeof(struct gre_base_hdr); + ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: @@ -570,7 +605,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (ip6h->hop_limit <= 1) return -1; - if (!pskb_may_pull(skb, thoff + *hdrsize)) + if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; switch (nexthdr) { @@ -590,65 +625,47 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, } } - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v6 = ip6h->saddr; tuple->dst_v6 = ip6h->daddr; tuple->l3proto = AF_INET6; tuple->l4proto = nexthdr; - tuple->iifidx = dev->ifindex; + tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; } -unsigned int -nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct flow_offload_tuple_rhash *tuplehash, + struct sk_buff *skb) { - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; enum flow_offload_tuple_dir dir; - const struct in6_addr *nexthop; struct flow_offload *flow; - struct net_device *outdev; unsigned int thoff, mtu; - u32 hdrsize, offset = 0; struct ipv6hdr *ip6h; - struct rt6_info *rt; - int ret; - - if (skb->protocol != htons(ETH_P_IPV6) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset)) - return NF_ACCEPT; - - if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - mtu = flow->tuplehash[dir].tuple.mtu + offset; + mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) - return NF_ACCEPT; + return 0; - ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); - thoff = sizeof(*ip6h) + offset; + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); + thoff = sizeof(*ip6h) + ctx->offset; if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) - return NF_ACCEPT; + return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); - return NF_ACCEPT; + return 0; } - if (skb_try_make_writable(skb, thoff + hdrsize)) - return NF_DROP; + if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + return -1; flow_offload_refresh(flow_table, flow, false); @@ -663,6 +680,52 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + return 1; +} + +static struct flow_offload_tuple_rhash * +nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx, + struct nf_flowtable *flow_table, + struct sk_buff *skb) +{ + struct flow_offload_tuple tuple = {}; + + if (skb->protocol != htons(ETH_P_IPV6) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset)) + return NULL; + + if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0) + return NULL; + + return flow_offload_lookup(flow_table, &tuple); +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + enum flow_offload_tuple_dir dir; + struct nf_flowtable_ctx ctx = { + .in = state->in, + }; + const struct in6_addr *nexthop; + struct flow_offload *flow; + struct net_device *outdev; + struct rt6_info *rt; + int ret; + + tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb); + if (tuplehash == NULL) + return NF_ACCEPT; + + ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { rt = (struct rt6_info *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); @@ -671,6 +734,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = (struct rt6_info *)tuplehash->tuple.dst_cache; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ce829d434f13..fadbd4ed3dc0 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -27,6 +27,9 @@ #include "nf_internals.h" +#define NF_NAT_MAX_ATTEMPTS 128 +#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) + static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); @@ -197,6 +200,88 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } +static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) +{ + static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | + IPS_DYING; + static const unsigned long flags_needed = IPS_SRC_NAT; + enum tcp_conntrack old_state; + + old_state = READ_ONCE(ct->proto.tcp.state); + if (old_state < TCP_CONNTRACK_TIME_WAIT) + return false; + + if (flags & flags_refuse) + return false; + + return (flags & flags_needed) == flags_needed; +} + +/* reverse direction will send packets to new source, so + * make sure such packets are invalid. + */ +static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) +{ + return (__s32)(new->proto.tcp.seen[0].td_end - + old->proto.tcp.seen[0].td_end) > 0; +} + +static int +nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack, + unsigned int attempts_left) +{ + static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; + struct nf_conntrack_tuple_hash *thash; + const struct nf_conntrack_zone *zone; + struct nf_conntrack_tuple reply; + unsigned long flags; + struct nf_conn *ct; + bool taken = true; + struct net *net; + + nf_ct_invert_tuple(&reply, tuple); + + if (attempts_left > NF_NAT_HARDER_THRESH || + tuple->dst.protonum != IPPROTO_TCP || + ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) + return nf_conntrack_tuple_taken(&reply, ignored_conntrack); + + /* :ast few attempts to find a free tcp port. Destructive + * action: evict colliding if its in timewait state and the + * tcp sequence number has advanced past the one used by the + * old entry. + */ + net = nf_ct_net(ignored_conntrack); + zone = nf_ct_zone(ignored_conntrack); + + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) + return false; + + ct = nf_ct_tuplehash_to_ctrack(thash); + + if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) + goto out; + + if (WARN_ON_ONCE(ct == ignored_conntrack)) + goto out; + + flags = READ_ONCE(ct->status); + if (!nf_nat_may_kill(ct, flags)) + goto out; + + if (!nf_seq_has_advanced(ct, ignored_conntrack)) + goto out; + + /* Even if we can evict do not reuse if entry is offloaded. */ + if (nf_ct_kill(ct)) + taken = flags & flags_offload; +out: + nf_ct_put(ct); + return taken; +} + static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { @@ -385,7 +470,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; - static const unsigned int max_attempts = 128; switch (tuple->dst.protonum) { case IPPROTO_ICMP: @@ -471,8 +555,8 @@ find_free_id: off = get_random_u16(); attempts = range_size; - if (attempts > max_attempts) - attempts = max_attempts; + if (attempts > NF_NAT_MAX_ATTEMPTS) + attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. @@ -483,7 +567,7 @@ find_free_id: another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); - if (!nf_nat_used_tuple(tuple, ct)) + if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4c7937fd803f..9573a8fcad79 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -567,6 +567,7 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, nft_trans_set_update(trans) = true; nft_trans_set_gc_int(trans) = desc->gc_int; nft_trans_set_timeout(trans) = desc->timeout; + nft_trans_set_size(trans) = desc->size; } nft_trans_commit_list_add_tail(ctx->net, trans); @@ -5343,6 +5344,8 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, nft_set_trans_unbind(ctx, set); if (nft_set_is_anonymous(set)) nft_deactivate_next(ctx->net, set); + else + list_del_rcu(&binding->list); set->use--; break; @@ -5442,7 +5445,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_ext *ext) + const struct nft_set_ext *ext, + bool reset) { struct nft_set_elem_expr *elem_expr; u32 size, num_exprs = 0; @@ -5455,7 +5459,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) return -1; return 0; @@ -5466,7 +5470,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -5479,11 +5483,13 @@ nla_put_failure: static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + u64 timeout = 0; nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) @@ -5506,7 +5512,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && - nft_set_elem_expr_dump(skb, set, ext)) + nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -5519,11 +5525,15 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + timeout = *nft_set_ext_timeout(ext); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + nf_jiffies64_to_msecs(timeout), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } else if (set->flags & NFT_SET_TIMEOUT) { + timeout = READ_ONCE(set->timeout); + } if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { u64 expires, now = get_jiffies_64(); @@ -5538,6 +5548,9 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; + + if (reset) + *nft_set_ext_expiration(ext) = now + timeout; } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -5561,6 +5574,7 @@ struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct sk_buff *skb; + bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, @@ -5571,7 +5585,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set_dump_args *args; args = container_of(iter, struct nft_set_dump_args, iter); - return nf_tables_fill_setelem(args->skb, set, elem); + return nf_tables_fill_setelem(args->skb, set, elem, args->reset); } struct nft_set_dump_ctx { @@ -5580,7 +5594,7 @@ struct nft_set_dump_ctx { }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, - const struct nft_set *set) + const struct nft_set *set, bool reset) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); @@ -5595,7 +5609,7 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, continue; elem.priv = catchall->elem; - ret = nf_tables_fill_setelem(skb, set, &elem); + ret = nf_tables_fill_setelem(skb, set, &elem, reset); break; } @@ -5613,6 +5627,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; + bool reset = false; u32 portid, seq; int event; @@ -5660,8 +5675,12 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + args.cb = cb; args.skb = skb; + args.reset = reset; args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; @@ -5670,7 +5689,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) - args.iter.err = nft_set_catchall_dump(net, skb, set); + args.iter.err = nft_set_catchall_dump(net, skb, set, reset); rcu_read_unlock(); nla_nest_end(skb, nest); @@ -5708,7 +5727,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { struct nlmsghdr *nlh; struct nlattr *nest; @@ -5729,7 +5749,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - err = nf_tables_fill_setelem(skb, set, elem); + err = nf_tables_fill_setelem(skb, set, elem, reset); if (err < 0) goto nla_put_failure; @@ -5835,7 +5855,7 @@ static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, } static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; @@ -5879,7 +5899,8 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, - NFT_MSG_NEWSETELEM, 0, set, &elem); + NFT_MSG_NEWSETELEM, 0, set, &elem, + reset); if (err < 0) goto err_fill_setelem; @@ -5903,6 +5924,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; + bool reset = false; int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, @@ -5937,8 +5959,11 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&ctx, set, attr); + err = nft_get_set_elem(&ctx, set, attr, reset); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -5971,7 +5996,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, - set, elem); + set, elem, false); if (err < 0) { kfree_skb(skb); goto err; @@ -6754,10 +6779,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_element_clash; } - if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size && - !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { - err = -ENFILE; - goto err_set_full; + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + unsigned int max = set->size ? set->size + set->ndeact : UINT_MAX; + + if (!atomic_add_unless(&set->nelems, 1, max)) { + err = -ENFILE; + goto err_set_full; + } } nft_trans_elem(trans) = elem; @@ -6769,7 +6797,9 @@ err_set_full: err_element_clash: kfree(trans); err_elem_free: - nft_set_elem_destroy(set, elem.priv, true); + nf_tables_set_elem_destroy(ctx, set, elem.priv); + if (obj) + obj->use--; err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); @@ -8917,6 +8947,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETSETELEM_RESET] = { + .call = nf_tables_getsetelem, + .type = NFNL_CB_RCU, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_DELSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, @@ -9667,6 +9703,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans)); WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans)); + + if (nft_trans_set_size(trans)) + WRITE_ONCE(set->size, nft_trans_set_size(trans)); } else { nft_clear(net, nft_trans_set(trans)); /* This avoids hitting -EBUSY when deleting the table diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index e311462f6d98..556bc902af00 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -30,6 +30,7 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <linux/cgroup-defs.h> +#include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 2527a01486ef..ca857afbf061 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -86,7 +86,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, [NFTA_BITWISE_XOR] = { .type = NLA_NESTED }, - [NFTA_BITWISE_OP] = { .type = NLA_U32 }, + [NFTA_BITWISE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index b66647a5a171..9a85e797ed58 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -88,9 +88,9 @@ void nft_byteorder_eval(const struct nft_expr *expr, static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_OP] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 }, - [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 }, + [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_byteorder_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index b9c84499438b..38958e067aa8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -332,7 +332,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, - [NFTA_CT_KEY] = { .type = NLA_U32 }, + [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index bd19c7aec92e..4fb34d76dbea 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -148,7 +148,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, - [NFTA_DYNSET_OP] = { .type = NLA_U32 }, + [NFTA_DYNSET_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a54a7f772cec..7f856ceb3a66 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,6 +10,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/dccp.h> #include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -406,13 +407,89 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_dccp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int thoff, dataoff, optoff, optlen, i; + u32 *dest = ®s->data[priv->dreg]; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; + + if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff) + goto err; + + thoff = nft_thoff(pkt); + + dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh); + if (!dh) + goto err; + + dataoff = dh->dccph_doff * sizeof(u32); + optoff = __dccp_hdr_len(dh); + if (dataoff <= optoff) + goto err; + + optlen = dataoff - optoff; + + for (i = 0; i < optlen; ) { + /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in + * the length; the remaining options are at least 2B long. In + * all cases, the first byte contains the option type. In + * multi-byte options, the second byte contains the option + * length, which must be at least two: 1 for the type plus 1 for + * the length plus 0-253 for any following option data. We + * aren't interested in the option data, only the type and the + * length, so we don't need to read more than two bytes at a + * time. + */ + unsigned int buflen = optlen - i; + u8 buf[2], *bufp; + u8 type, len; + + if (buflen > sizeof(buf)) + buflen = sizeof(buf); + + bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen, + &buf); + if (!bufp) + goto err; + + type = bufp[0]; + + if (type == priv->type) { + *dest = 1; + return; + } + + if (type <= DCCPO_MAX_RESERVED) { + i++; + continue; + } + + if (buflen < 2) + goto err; + + len = bufp[1]; + + if (len < 2) + goto err; + + i += len; + } + +err: + *dest = 0; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, - [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, + [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, - [NFTA_EXTHDR_OP] = { .type = NLA_U32 }, + [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; @@ -557,6 +634,22 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, return 0; } +static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + if (!(priv->flags & NFT_EXTHDR_F_PRESENT)) + return -EOPNOTSUPP; + + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -686,6 +779,15 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = { .reduce = nft_exthdr_reduce, }; +static const struct nft_expr_ops nft_exthdr_dccp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_dccp_eval, + .init = nft_exthdr_dccp_init, + .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -720,6 +822,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; + case NFT_EXTHDR_OP_DCCP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_dccp_ops; + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index e860d8fe0e5e..5ef9146e74ad 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -250,9 +250,14 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, break; } + if (!dst_hold_safe(this_dst)) + return -ENOENT; + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) + if (!other_dst) { + dst_release(this_dst); return -ENOENT; + } nft_default_forward_path(route, this_dst, dir); nft_default_forward_path(route, other_dst, !dir); @@ -349,8 +354,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; - if (flow_offload_route_init(flow, &route) < 0) - goto err_flow_add; + flow_offload_route_init(flow, &route); if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; @@ -361,12 +365,12 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (ret < 0) goto err_flow_add; - dst_release(route.tuple[!dir].dst); return; err_flow_add: flow_offload_free(flow); err_flow_alloc: + dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 7b9d4d1bd17c..a5268e6dd32f 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -40,7 +40,7 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 }, - [NFTA_FWD_NFPROTO] = { .type = NLA_U32 }, + [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_fwd_netdev_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index ee8d487b69c0..92d47e469204 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -59,7 +59,7 @@ static void nft_symhash_eval(const struct nft_expr *expr, static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, - [NFTA_HASH_LEN] = { .type = NLA_U32 }, + [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 03ef4fdaa460..29ac48cdd6db 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -19,6 +19,7 @@ struct nft_lookup { struct nft_set *set; u8 sreg; u8 dreg; + bool dreg_set; bool invert; struct nft_set_binding binding; }; @@ -75,7 +76,7 @@ void nft_lookup_eval(const struct nft_expr *expr, } if (ext) { - if (set->flags & NFT_SET_MAP) + if (priv->dreg_set) nft_data_copy(®s->data[priv->dreg], nft_set_ext_data(ext), set->dlen); @@ -122,11 +123,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (flags & ~NFT_LOOKUP_F_INV) return -EINVAL; - if (flags & NFT_LOOKUP_F_INV) { - if (set->flags & NFT_SET_MAP) - return -EINVAL; + if (flags & NFT_LOOKUP_F_INV) priv->invert = true; - } } if (tb[NFTA_LOOKUP_DREG] != NULL) { @@ -140,8 +138,17 @@ static int nft_lookup_init(const struct nft_ctx *ctx, set->dlen); if (err < 0) return err; - } else if (set->flags & NFT_SET_MAP) - return -EINVAL; + priv->dreg_set = true; + } else if (set->flags & NFT_SET_MAP) { + /* Map given, but user asks for lookup only (i.e. to + * ignore value assoicated with key). + * + * This makes no sense for anonymous maps since they are + * scoped to the rule, but for named sets this can be useful. + */ + if (set->flags & NFT_SET_ANONYMOUS) + return -EINVAL; + } priv->binding.flags = set->flags & NFT_SET_MAP; @@ -188,7 +195,7 @@ static int nft_lookup_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; - if (priv->set->flags & NFT_SET_MAP) + if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e384e0de7a54..8fdc7318c03c 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -458,7 +458,7 @@ EXPORT_SYMBOL_GPL(nft_meta_set_eval); const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, - [NFTA_META_KEY] = { .type = NLA_U32 }, + [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_META_SREG] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_meta_policy); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 3a3c7746e88f..8cb800989947 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -171,7 +171,8 @@ void nft_payload_eval(const struct nft_expr *expr, if (!skb_mac_header_was_set(skb)) goto err; - if (skb_vlan_tag_present(skb)) { + if (skb_vlan_tag_present(skb) && + priv->offset >= offsetof(struct ethhdr, h_proto)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 0566d6aaf1e5..51ae64cd268f 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -42,7 +42,7 @@ void nft_range_eval(const struct nft_expr *expr, static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { [NFTA_RANGE_SREG] = { .type = NLA_U32 }, - [NFTA_RANGE_OP] = { .type = NLA_U32 }, + [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, }; diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index f2addc844dd2..ed2e668474d6 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -18,7 +18,7 @@ #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 5990fdd7b3cc..35a2c28caa60 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -104,7 +104,7 @@ err: static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { [NFTA_RT_DREG] = { .type = NLA_U32 }, - [NFTA_RT_KEY] = { .type = NLA_U32 }, + [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_rt_get_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 0452ee586c1c..db526cb7a485 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1274,8 +1274,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) struct nft_pipapo_match *new; int i; - new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, - GFP_KERNEL); + new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); @@ -2084,8 +2083,7 @@ static int nft_pipapo_init(const struct nft_set *set, if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; - m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count, - GFP_KERNEL); + m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); if (!m) return -ENOMEM; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 85f8df87efda..84def74698b7 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -138,9 +138,9 @@ static void nft_socket_eval(const struct nft_expr *expr, } static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { - [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, + [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, - [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 }, + [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_socket_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index ea83f661417e..ae15cd693f0e 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -183,7 +183,7 @@ static void nft_tproxy_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = { - [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 }, + [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 }, [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index b059aa541798..9f21953c7433 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -66,9 +66,9 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr, } static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = { - [NFTA_TUNNEL_KEY] = { .type = NLA_U32 }, + [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TUNNEL_DREG] = { .type = NLA_U32 }, - [NFTA_TUNNEL_MODE] = { .type = NLA_U32 }, + [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_tunnel_get_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index c88fd078a9ae..452f8587adda 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -16,9 +16,9 @@ #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { - [NFTA_XFRM_KEY] = { .type = NLA_U32 }, + [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, - [NFTA_XFRM_SPNUM] = { .type = NLA_U32 }, + [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; |