From 4a92602aa1cd5bbaeedbd9536ff992f7d26fe9d1 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Fri, 5 Feb 2016 09:20:52 -0700 Subject: openvswitch: allow management from inside user namespaces Operations with the GENL_ADMIN_PERM flag fail permissions checks because this flag means we call netlink_capable, which uses the init user ns. Instead, let's introduce a new flag, GENL_UNS_ADMIN_PERM for operations which should be allowed inside a user namespace. The motivation for this is to be able to run openvswitch in unprivileged containers. I've tested this and it seems to work, but I really have no idea about the security consequences of this patch, so thoughts would be much appreciated. v2: use the GENL_UNS_ADMIN_PERM flag instead of a check in each function v3: use separate ifs for UNS_ADMIN_PERM and ADMIN_PERM, instead of one massive one Reported-by: James Page Signed-off-by: Tycho Andersen CC: Eric Biederman CC: Pravin Shelar CC: Justin Pettit CC: "David S. Miller" Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index deadfdab1bc3..d6f7fe92744a 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -654,7 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { static const struct genl_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = packet_policy, .doit = ovs_packet_cmd_execute } @@ -1391,12 +1391,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_del }, @@ -1407,7 +1407,7 @@ static const struct genl_ops dp_flow_genl_ops[] = { .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_set, }, @@ -1777,12 +1777,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { static const struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_del }, @@ -1793,7 +1793,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = { .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_set, }, @@ -2158,12 +2158,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { static const struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_del }, @@ -2174,7 +2174,7 @@ static const struct genl_ops dp_vport_genl_ops[] = { .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_set, }, -- cgit v1.2.3 From d71785ffc7e7cae3fbdc4ea8a9d05b7a1c59f7b8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:57 +0100 Subject: net: add dst_cache to ovs vxlan lwtunnel In case of UDP traffic with datagram length below MTU this give about 2% performance increase when tunneling over ipv4 and about 60% when tunneling over ipv6 Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 15 ++++++++------- include/net/dst_metadata.h | 1 + include/net/ip_tunnels.h | 3 +++ net/core/dst.c | 10 +++++++++- net/openvswitch/Kconfig | 1 + net/openvswitch/flow_netlink.c | 6 ++++++ 6 files changed, 28 insertions(+), 8 deletions(-) (limited to 'net/openvswitch') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ad673037bd73..ee1206d9f8df 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1775,7 +1775,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, /* when the ip_tunnel_info is availble, the tos used for lookup is * packet independent, so we can use the cache */ - if (dst_cache && !skb->mark && (!tos || info)) { + if (!skb->mark && (!tos || info)) { use_cache = true; rt = dst_cache_get_ip4(dst_cache, saddr); if (rt) @@ -1806,13 +1806,11 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, struct in6_addr *saddr, struct dst_cache *dst_cache) { - bool use_cache = false; struct dst_entry *ndst; struct flowi6 fl6; int err; - if (dst_cache && !skb->mark) { - use_cache = true; + if (!skb->mark) { ndst = dst_cache_get_ip6(dst_cache, saddr); if (ndst) return ndst; @@ -1832,7 +1830,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, return ERR_PTR(err); *saddr = fl6.saddr; - if (use_cache) + if (!skb->mark) dst_cache_set_ip6(dst_cache, ndst, saddr); return ndst; } @@ -1886,6 +1884,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { + struct dst_cache *dst_cache; struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk; @@ -1910,6 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = rdst->remote_vni; dst = &rdst->remote_ip; + dst_cache = &rdst->dst_cache; } else { if (!info) { WARN_ONCE(1, "%s: Missing encapsulation instructions\n", @@ -1924,6 +1924,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, else remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; dst = &remote_ip; + dst_cache = &info->dst_cache; } if (vxlan_addr_any(dst)) { @@ -1976,7 +1977,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rt = vxlan_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, tos, dst->sin.sin_addr.s_addr, &saddr, - rdst ? &rdst->dst_cache : NULL, info); + dst_cache, info); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &dst->sin.sin_addr.s_addr); @@ -2029,7 +2030,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = vxlan6_get_route(vxlan, skb, rdst ? rdst->remote_ifindex : 0, &dst->sin6.sin6_addr, &saddr, - rdst ? &rdst->dst_cache : NULL); + dst_cache); if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 30a56ab2ccfb..84b833af6882 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -62,6 +62,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, sizeof(a->u.tun_info) + a->u.tun_info.options_len); } +void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index fd36936d85a6..87408ab80856 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -58,6 +58,9 @@ struct ip_tunnel_key { struct ip_tunnel_info { struct ip_tunnel_key key; +#ifdef CONFIG_DST_CACHE + struct dst_cache dst_cache; +#endif u8 options_len; u8 mode; }; diff --git a/net/core/dst.c b/net/core/dst.c index a1656e3b8d72..b5cbbe07f786 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -265,7 +265,7 @@ again: lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) - kfree(dst); + metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); @@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} + struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) { int cpu; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d143aa9f6654..cd5fd9d728a7 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -10,6 +10,7 @@ config OPENVSWITCH select LIBCRC32C select MPLS select NET_MPLS_GSO + select DST_CACHE ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d1bd4a45ca2d..58b8efc23668 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (!tun_dst) return -ENOMEM; + err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); + if (err) { + dst_release((struct dst_entry *)tun_dst); + return err; + } + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, sizeof(*ovs_tun), log); if (IS_ERR(a)) { -- cgit v1.2.3 From 551ddc057e290cc07ff900050da242f35d615d3c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Feb 2016 15:03:25 +0100 Subject: openvswitch: Revert: "Enable memory mapped Netlink i/o" revert commit 795449d8b846 ("openvswitch: Enable memory mapped Netlink i/o"). Following the mmaped netlink removal this code can be removed. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d6f7fe92744a..35a2659a277e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -422,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; - struct genl_info info = { - .dst_sk = ovs_dp_get_net(dp)->genl_sock, - .snd_portid = upcall_info->portid, - }; size_t len; unsigned int hlen; int err, dp_ifindex; @@ -466,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, hlen = skb->len; len = upcall_msg_size(upcall_info, hlen); - user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); + user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -876,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); - skb = genlmsg_new_unicast(len, info, GFP_KERNEL); + skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -1483,7 +1479,7 @@ error: static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) { - return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); + return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. */ -- cgit v1.2.3 From 263ea09084d172cac6e40459a690babe8de8e448 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Feb 2016 15:03:26 +0100 Subject: Revert "genl: Add genlmsg_new_unicast() for unicast message allocation" This reverts commit bb9b18fb55b0 ("genl: Add genlmsg_new_unicast() for unicast message allocation")'. Nothing wrong with it; its no longer needed since this was only for mmapped netlink support. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/genetlink.h | 4 ---- net/netlink/genetlink.c | 21 --------------------- net/openvswitch/datapath.c | 10 +++++----- net/tipc/netlink_compat.c | 1 - 4 files changed, 5 insertions(+), 31 deletions(-) (limited to 'net/openvswitch') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 43c0e771f417..8d4608ce8716 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -83,7 +83,6 @@ struct genl_family { * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers - * @dst_sk: destination socket */ struct genl_info { u32 snd_seq; @@ -94,7 +93,6 @@ struct genl_info { struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; - struct sock * dst_sk; }; static inline struct net *genl_info_net(struct genl_info *info) @@ -188,8 +186,6 @@ int genl_unregister_family(struct genl_family *family); void genl_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, struct genl_family *family, int flags, u8 cmd); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0ffd721126e7..a09132a69869 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -462,26 +462,6 @@ int genl_unregister_family(struct genl_family *family) } EXPORT_SYMBOL(genl_unregister_family); -/** - * genlmsg_new_unicast - Allocate generic netlink message for unicast - * @payload: size of the message payload - * @info: information on destination - * @flags: the type of memory to allocate - * - * Allocates a new sk_buff large enough to cover the specified payload - * plus required Netlink headers. Will check receiving socket for - * memory mapped i/o capability and use it if enabled. Will fall back - * to non-mapped skb if message size exceeds the frame size of the ring. - */ -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags) -{ - size_t len = nlmsg_total_size(genlmsg_total_size(payload)); - - return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags); -} -EXPORT_SYMBOL_GPL(genlmsg_new_unicast); - /** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message @@ -642,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family, info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; - info.dst_sk = skb->sk; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 35a2659a277e..c4e8455d5d56 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1477,7 +1477,7 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +static struct sk_buff *ovs_dp_cmd_alloc_info(void) { return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } @@ -1532,7 +1532,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1653,7 +1653,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1686,7 +1686,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1719,7 +1719,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 2c016fdefe97..de66d8f945ed 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1104,7 +1104,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) req_nlh = (struct nlmsghdr *)skb->data; msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; - msg.dst_sk = info->dst_sk; msg.net = genl_info_net(info); if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { -- cgit v1.2.3 From 6b83d28a55a891a9d70fc61ccb1c138e47dcbe74 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Feb 2016 00:29:30 +0100 Subject: net: use skb_postpush_rcsum instead of own implementations Replace individual implementations with the recently introduced skb_postpush_rcsum() helper. Signed-off-by: Daniel Borkmann Acked-by: Tom Herbert Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 +--- net/ipv6/reassembly.c | 6 ++---- net/openvswitch/actions.c | 8 +++----- net/openvswitch/vport-netdev.c | 2 +- net/openvswitch/vport.h | 7 ------- 5 files changed, 7 insertions(+), 20 deletions(-) (limited to 'net/openvswitch') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a5bd067ec1a3..8bd4b7951bc0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4496,9 +4496,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) skb->mac_len += VLAN_HLEN; __skb_pull(skb, offset); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 18f3498a6c80..e2ea31175ef9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -496,10 +496,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); + skb_postpush_rcsum(head, skb_network_header(head), + skb_network_header_len(head)); rcu_read_lock(); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2d59df521915..e9dd47b2a85b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, new_mpls_lse = (__be32 *)skb_mpls_header(skb); *new_mpls_lse = mpls->mpls_lse; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, - MPLS_HLEN, 0)); + skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); hdr = eth_hdr(skb); hdr->h_proto = mpls->mpls_ethertype; @@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, mask->eth_dst); - ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); @@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk /* Reconstruct the MAC header. */ skb_push(skb, data->l2_len); memcpy(skb->data, &data->l2_data, data->l2_len); - ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); skb_reset_mac_header(skb); ovs_vport_send(vport, skb); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 6a6adf314363..4e3972344aa6 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb) return; skb_push(skb, ETH_HLEN); - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index c10899cb9040..f01f28a567ad 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv) int ovs_vport_receive(struct vport *, struct sk_buff *, const struct ip_tunnel_info *); -static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); -} - static inline const char *ovs_vport_name(struct vport *vport) { return vport->dev->name; -- cgit v1.2.3 From 3a927bc7cf9d0fbe8f4a8189dd5f8440228f64e7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Feb 2016 10:45:39 +0100 Subject: ovs: propagate per dp max headroom to all vports This patch implements bookkeeping support to compute the maximum headroom for all the devices in each datapath. When said value changes, the underlying devs are notified via the ndo_set_rx_headroom method. This also increases the internal vports xmit performance. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 40 ++++++++++++++++++++++++++++++++++++ net/openvswitch/datapath.h | 4 ++++ net/openvswitch/vport-internal_dev.c | 10 ++++++++- 3 files changed, 53 insertions(+), 1 deletion(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c4e8455d5d56..e6a7d494df24 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1908,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net, return ERR_PTR(-EINVAL); } +/* Called with ovs_mutex */ +static void update_headroom(struct datapath *dp) +{ + unsigned dev_headroom, max_headroom = 0; + struct net_device *dev; + struct vport *vport; + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + dev = vport->dev; + dev_headroom = netdev_get_fwd_headroom(dev); + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + } + + dp->max_headroom = max_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + netdev_set_rx_headroom(vport->dev, max_headroom); +} + static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1973,6 +1996,12 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); + + if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) + update_headroom(dp); + else + netdev_set_rx_headroom(vport->dev, dp->max_headroom); + BUG_ON(err < 0); ovs_unlock(); @@ -2039,8 +2068,10 @@ exit_unlock_free: static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { + bool must_update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; + struct datapath *dp; struct vport *vport; int err; @@ -2062,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL); BUG_ON(err < 0); + + /* the vport deletion may trigger dp headroom update */ + dp = vport->dp; + if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) + must_update_headroom = true; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); + + if (must_update_headroom) + update_headroom(dp); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 67bdecd9fdc1..427e39a045cf 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -68,6 +68,8 @@ struct dp_stats_percpu { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @max_headroom: the maximum headroom of all vports in this datapath; it will + * be used by all the internal vports in this dp. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -89,6 +91,8 @@ struct datapath { possible_net_t net; u32 user_features; + + u32 max_headroom; }; /** diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index ec76398a792f..83a5534abd31 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) return stats; } +void internal_set_rx_headroom(struct net_device *dev, int new_hr) +{ + dev->needed_headroom = new_hr; +} + static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, @@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_get_stats, + .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_PHONY_HEADROOM; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } + vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); -- cgit v1.2.3 From 6f15cdbf8a8ac2e22767cc8b1eae225702733c95 Mon Sep 17 00:00:00 2001 From: Samuel Gauthier Date: Thu, 10 Mar 2016 17:14:59 +0100 Subject: ovs: allow nl 'flow set' to use ufid without flow key When we want to change a flow using netlink, we have to identify it to be able to perform a lookup. Both the flow key and unique flow ID (ufid) are valid identifiers, but we always have to specify the flow key in the netlink message. When both attributes are there, the ufid is used. The flow key is used to validate the actions provided by the userland. This commit allows to use the ufid without having to provide the flow key, as it is already done in the netlink 'flow get' and 'flow del' path. The flow key remains mandatory when an action is provided. Signed-off-by: Samuel Gauthier Reviewed-by: Simon Horman Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e6a7d494df24..0cc66a4e492d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1096,26 +1096,32 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sw_flow_match match; struct sw_flow_id sfid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - int error; + int error = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; - /* Extract key. */ - error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR(log, "Flow key attribute not present in set flow."); - goto error; - } - ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); - ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], - a[OVS_FLOW_ATTR_MASK], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + } else if (!ufid_present) { + OVS_NLERR(log, + "Flow set message rejected, Key attribute missing."); + error = -EINVAL; + } if (error) goto error; /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, + "Flow key attribute not present in set flow."); + error = -EINVAL; + goto error; + } + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, log); if (IS_ERR(acts)) { -- cgit v1.2.3 From bfa3f9d7f3b349acea8982d2248e33a0ed84c687 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:16 -0800 Subject: netfilter: Remove IP_CT_NEW_REPLY definition. Remove the definition of IP_CT_NEW_REPLY from the kernel as it does not make sense. This allows the definition of IP_CT_NUMBER to be simplified as well. Signed-off-by: Jarno Rajahalme Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++++++++--- net/openvswitch/conntrack.c | 2 -- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net/openvswitch') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 319f47128db8..6d074d14ee27 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -20,9 +20,15 @@ enum ip_conntrack_info { IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY, IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY, - IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY, - /* Number of distinct IP_CT types (no NEW in reply dirn). */ - IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1 + /* No NEW in reply direction. */ + + /* Number of distinct IP_CT types. */ + IP_CT_NUMBER, + + /* only for userspace compatibility */ +#ifndef __KERNEL__ + IP_CT_NEW_REPLY = IP_CT_NUMBER, +#endif }; #define NF_CT_STATE_INVALID_BIT (1 << 0) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index ee6ff8ffc12d..304529015744 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -75,7 +75,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) switch (ctinfo) { case IP_CT_ESTABLISHED_REPLY: case IP_CT_RELATED_REPLY: - case IP_CT_NEW_REPLY: ct_state |= OVS_CS_F_REPLY_DIR; break; default: @@ -92,7 +91,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) ct_state |= OVS_CS_F_RELATED; break; case IP_CT_NEW: - case IP_CT_NEW_REPLY: ct_state |= OVS_CS_F_NEW; break; default: -- cgit v1.2.3 From 9f13ded8d3c715147c4759f937cfb712c185ca13 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:18 -0800 Subject: openvswitch: Add commentary to conntrack.c This makes the code easier to understand and the following patches more focused. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/conntrack.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 304529015744..2c2bf071f6d6 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -152,8 +152,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb, ct = nf_ct_get(skb, &ctinfo); if (ct) { state = ovs_ct_get_state(ctinfo); + /* All unconfirmed entries are NEW connections. */ if (!nf_ct_is_confirmed(ct)) state |= OVS_CS_F_NEW; + /* OVS persists the related flag for the duration of the + * connection. + */ if (ct->master) state |= OVS_CS_F_RELATED; zone = nf_ct_zone(ct); @@ -165,6 +169,9 @@ static void ovs_ct_update_key(const struct sk_buff *skb, __ovs_ct_update_key(key, state, zone, ct); } +/* This is called to initialize CT key fields possibly coming in from the local + * stack. + */ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { ovs_ct_update_key(skb, NULL, key, false); @@ -199,7 +206,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, struct nf_conn *ct; u32 new_mark; - /* The connection could be invalid, in which case set_mark is no-op. */ ct = nf_ct_get(skb, &ctinfo); if (!ct) @@ -375,6 +381,11 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, return true; } +/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if + * not done already. Update key with new CT state. + * Note that if the packet is deemed invalid by conntrack, skb->nfct will be + * set to NULL and 0 will be returned. + */ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) @@ -418,6 +429,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, { struct nf_conntrack_expect *exp; + /* If we pass an expected packet through nf_conntrack_in() the + * expectation is typically removed, but the packet could still be + * lost in upcall processing. To prevent this from happening we + * perform an explicit expectation lookup. Expected connections are + * always new, and will be passed through conntrack only when they are + * committed, as it is OK to remove the expectation at that time. + */ exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); if (exp) { u8 state; @@ -455,6 +473,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; + /* This is a no-op if the connection has already been confirmed. */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) return -EINVAL; -- cgit v1.2.3 From 394e910e909b174270b8231fd51942eb2f541fb9 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:19 -0800 Subject: openvswitch: Update the CT state key only after nf_conntrack_in(). Only a successful nf_conntrack_in() call can effect a connection state change, so it suffices to update the key only after the nf_conntrack_in() returns. This change is needed for the later NAT patches. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/conntrack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 2c2bf071f6d6..a487bb3486e1 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -382,7 +382,8 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, } /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if - * not done already. Update key with new CT state. + * not done already. Update key with new CT state after passing the packet + * through conntrack. * Note that if the packet is deemed invalid by conntrack, skb->nfct will be * set to NULL and 0 will be returned. */ @@ -411,14 +412,14 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, skb) != NF_ACCEPT) return -ENOENT; + ovs_ct_update_key(skb, info, key, true); + if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { WARN_ONCE(1, "helper rejected packet"); return -EINVAL; } } - ovs_ct_update_key(skb, info, key, true); - return 0; } -- cgit v1.2.3 From 289f225349cb2a97448fd14599ab34b741f706f3 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:20 -0800 Subject: openvswitch: Find existing conntrack entry after upcall. Add a new function ovs_ct_find_existing() to find an existing conntrack entry for which this packet was already applied to. This is only to be called when there is evidence that the packet was already tracked and committed, but we lost the ct reference due to an userspace upcall. ovs_ct_find_existing() is called from skb_nfct_cached(), which can now hide the fact that the ct reference may have been lost due to an upcall. This allows ovs_ct_commit() to be simplified. This patch is needed by later "openvswitch: Interface with NAT" patch, as we need to be able to pass the packet through NAT using the original ct reference also after the reference is lost after an upcall. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/conntrack.c | 103 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 13 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a487bb3486e1..ae36fe2ed483 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -356,14 +356,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, return __nf_ct_expect_find(net, zone, &tuple); } +/* This replicates logic from nf_conntrack_core.c that is not exported. */ +static enum ip_conntrack_info +ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) +{ + const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + return IP_CT_ESTABLISHED_REPLY; + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + return IP_CT_ESTABLISHED; + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + return IP_CT_RELATED; + return IP_CT_NEW; +} + +/* Find an existing connection which this packet belongs to without + * re-attributing statistics or modifying the connection state. This allows an + * skb->nfct lost due to an upcall to be recovered during actions execution. + * + * Must be called with rcu_read_lock. + * + * On success, populates skb->nfct and skb->nfctinfo, and returns the + * connection. Returns NULL if there is no existing entry. + */ +static struct nf_conn * +ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, + u8 l3num, struct sk_buff *skb) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + unsigned int dataoff; + u8 protonum; + + l3proto = __nf_ct_l3proto_find(l3num); + if (!l3proto) { + pr_debug("ovs_ct_find_existing: Can't get l3proto\n"); + return NULL; + } + if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, + &protonum) <= 0) { + pr_debug("ovs_ct_find_existing: Can't get protonum\n"); + return NULL; + } + l4proto = __nf_ct_l4proto_find(l3num, protonum); + if (!l4proto) { + pr_debug("ovs_ct_find_existing: Can't get l4proto\n"); + return NULL; + } + if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, + protonum, net, &tuple, l3proto, l4proto)) { + pr_debug("ovs_ct_find_existing: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return NULL; /* Not found. */ + + ct = nf_ct_tuplehash_to_ctrack(h); + + ctinfo = ovs_ct_get_info(h); + if (ctinfo == IP_CT_NEW) { + /* This should not happen. */ + WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct); + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = ctinfo; + return ct; +} + /* Determine whether skb->nfct is equal to the result of conntrack lookup. */ -static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, - const struct ovs_conntrack_info *info) +static bool skb_nfct_cached(struct net *net, + const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); + /* If no ct, check if we have evidence that an existing conntrack entry + * might be found for this skb. This happens when we lose a skb->nfct + * due to an upcall. If the connection was not confirmed, it is not + * cached and needs to be run through conntrack again. + */ + if (!ct && key->ct.state & OVS_CS_F_TRACKED && + !(key->ct.state & OVS_CS_F_INVALID) && + key->ct.zone == info->zone.id) + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); if (!ct) return false; if (!net_eq(net, read_pnet(&ct->ct_net))) @@ -396,7 +483,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * actually run the packet through conntrack twice unless it's for a * different zone. */ - if (!skb_nfct_cached(net, skb, info)) { + if (!skb_nfct_cached(net, key, info, skb)) { struct nf_conn *tmpl = info->ct; /* Associate skb with specified zone. */ @@ -459,18 +546,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - u8 state; int err; - state = key->ct.state; - if (key->ct.zone == info->zone.id && - ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { - /* Previous lookup has shown that this connection is already - * tracked and committed. Skip committing. - */ - return 0; - } - err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; -- cgit v1.2.3 From 5b6b929376a621e2bd3367f5de563d7123506597 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:21 -0800 Subject: openvswitch: Handle NF_REPEAT in conntrack action. Repeat the nf_conntrack_in() call when it returns NF_REPEAT. This avoids dropping a SYN packet re-opening an existing TCP connection. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/conntrack.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index ae36fe2ed483..85256b312455 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -485,6 +485,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, */ if (!skb_nfct_cached(net, key, info, skb)) { struct nf_conn *tmpl = info->ct; + int err; /* Associate skb with specified zone. */ if (tmpl) { @@ -495,8 +496,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, skb->nfctinfo = IP_CT_NEW; } - if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, - skb) != NF_ACCEPT) + /* Repeat if requested, see nf_iterate(). */ + do { + err = nf_conntrack_in(net, info->family, + NF_INET_PRE_ROUTING, skb); + } while (err == NF_REPEAT); + + if (err != NF_ACCEPT) return -ENOENT; ovs_ct_update_key(skb, info, key, true); -- cgit v1.2.3 From 28b6e0c1ace45779c60e7cefe6d469b7ecb520b8 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:22 -0800 Subject: openvswitch: Delay conntrack helper call for new connections. There is no need to help connections that are not confirmed, so we can delay helping new connections to the time when they are confirmed. This change is needed for NAT support, and having this as a separate patch will make the following NAT patch a bit easier to review. Signed-off-by: Jarno Rajahalme Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/openvswitch/conntrack.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 85256b312455..f718b724e650 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -483,7 +483,11 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * actually run the packet through conntrack twice unless it's for a * different zone. */ - if (!skb_nfct_cached(net, key, info, skb)) { + bool cached = skb_nfct_cached(net, key, info, skb); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!cached) { struct nf_conn *tmpl = info->ct; int err; @@ -506,11 +510,18 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, return -ENOENT; ovs_ct_update_key(skb, info, key, true); + } - if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { - WARN_ONCE(1, "helper rejected packet"); - return -EINVAL; - } + /* Call the helper only if: + * - nf_conntrack_in() was executed above ("!cached") for a confirmed + * connection, or + * - When committing an unconfirmed connection. + */ + ct = nf_ct_get(skb, &ctinfo); + if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) && + ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + WARN_ONCE(1, "helper rejected packet"); + return -EINVAL; } return 0; -- cgit v1.2.3 From 05752523e56502cd9975aec0a2ded465d51a71f3 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:23 -0800 Subject: openvswitch: Interface with NAT. Extend OVS conntrack interface to cover NAT. New nested OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action. A bare OVS_CT_ATTR_NAT only mangles existing and expected connections. If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested attributes, new (non-committed/non-confirmed) connections are mangled according to the rest of the nested attributes. The corresponding OVS userspace patch series includes test cases (in tests/system-traffic.at) that also serve as example uses. This work extends on a branch by Thomas Graf at https://github.com/tgraf/ovs/tree/nat. Signed-off-by: Jarno Rajahalme Acked-by: Thomas Graf Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/openvswitch.h | 49 ++++ net/openvswitch/Kconfig | 3 +- net/openvswitch/conntrack.c | 524 +++++++++++++++++++++++++++++++++++++-- net/openvswitch/conntrack.h | 3 +- 4 files changed, 551 insertions(+), 28 deletions(-) (limited to 'net/openvswitch') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a27222d5b413..616d04761730 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -454,6 +454,14 @@ struct ovs_key_ct_labels { #define OVS_CS_F_REPLY_DIR 0x08 /* Flow is in the reply direction. */ #define OVS_CS_F_INVALID 0x10 /* Could not track connection. */ #define OVS_CS_F_TRACKED 0x20 /* Conntrack has occurred. */ +#define OVS_CS_F_SRC_NAT 0x40 /* Packet's source address/port was + * mangled by NAT. + */ +#define OVS_CS_F_DST_NAT 0x80 /* Packet's destination address/port + * was mangled by NAT. + */ + +#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. @@ -632,6 +640,8 @@ struct ovs_action_hash { * mask. For each bit set in the mask, the corresponding bit in the value is * copied to the connection tracking label field in the connection. * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. + * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address + * translation (NAT) on the packet. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -641,11 +651,50 @@ enum ovs_ct_attr { OVS_CT_ATTR_LABELS, /* labels to associate with this connection. */ OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of related connections. */ + OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ __OVS_CT_ATTR_MAX }; #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1) +/** + * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT. + * + * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port). + * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination + * address/port). Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be + * specified. Effective only for packets for ct_state NEW connections. + * Packets of committed connections are mangled by the NAT action according to + * the committed NAT type regardless of the flags specified. As a corollary, a + * NAT action without a NAT type flag will only mangle packets of committed + * connections. The following NAT attributes only apply for NEW + * (non-committed) connections, and they may be included only when the CT + * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or + * @OVS_NAT_ATTR_DST is also included. + * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port) + * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port) + * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots + * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5) + * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping + */ +enum ovs_nat_attr { + OVS_NAT_ATTR_UNSPEC, + OVS_NAT_ATTR_SRC, + OVS_NAT_ATTR_DST, + OVS_NAT_ATTR_IP_MIN, + OVS_NAT_ATTR_IP_MAX, + OVS_NAT_ATTR_PROTO_MIN, + OVS_NAT_ATTR_PROTO_MAX, + OVS_NAT_ATTR_PERSISTENT, + OVS_NAT_ATTR_PROTO_HASH, + OVS_NAT_ATTR_PROTO_RANDOM, + __OVS_NAT_ATTR_MAX, +}; + +#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1) + /** * enum ovs_action_attr - Action types. * diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index cd5fd9d728a7..234a73344c6e 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -6,7 +6,8 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET depends on !NF_CONNTRACK || \ - (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) + (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \ + (!NF_NAT || NF_NAT))) select LIBCRC32C select MPLS select NET_MPLS_GSO diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index f718b724e650..dc5eb29fe7d6 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -13,21 +13,31 @@ #include #include +#include +#include +#include #include #include #include #include +#include #include #include +#ifdef CONFIG_NF_NAT_NEEDED +#include +#include +#include +#endif + #include "datapath.h" #include "conntrack.h" #include "flow.h" #include "flow_netlink.h" struct ovs_ct_len_tbl { - size_t maxlen; - size_t minlen; + int maxlen; + int minlen; }; /* Metadata mark for masked write to conntrack mark */ @@ -42,15 +52,25 @@ struct md_labels { struct ovs_key_ct_labels mask; }; +enum ovs_ct_nat { + OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */ + OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */ + OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */ +}; + /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_helper *helper; struct nf_conntrack_zone zone; struct nf_conn *ct; u8 commit : 1; + u8 nat : 3; /* enum ovs_ct_nat */ u16 family; struct md_mark mark; struct md_labels labels; +#ifdef CONFIG_NF_NAT_NEEDED + struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ +#endif }; static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); @@ -137,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, ovs_ct_get_labels(ct, &key->ct.labels); } -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has - * previously sent the packet to conntrack via the ct action. +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. If + * 'keep_nat_flags' is true, the existing NAT flags retained, else they are + * initialized from the connection status. */ static void ovs_ct_update_key(const struct sk_buff *skb, const struct ovs_conntrack_info *info, - struct sw_flow_key *key, bool post_ct) + struct sw_flow_key *key, bool post_ct, + bool keep_nat_flags) { const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; @@ -160,6 +183,14 @@ static void ovs_ct_update_key(const struct sk_buff *skb, */ if (ct->master) state |= OVS_CS_F_RELATED; + if (keep_nat_flags) { + state |= key->ct.state & OVS_CS_F_NAT_MASK; + } else { + if (ct->status & IPS_SRC_NAT) + state |= OVS_CS_F_SRC_NAT; + if (ct->status & IPS_DST_NAT) + state |= OVS_CS_F_DST_NAT; + } zone = nf_ct_zone(ct); } else if (post_ct) { state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; @@ -174,7 +205,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb, */ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { - ovs_ct_update_key(skb, NULL, key, false); + ovs_ct_update_key(skb, NULL, key, false, false); } int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) @@ -263,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) enum ip_conntrack_info ctinfo; unsigned int protoff; struct nf_conn *ct; + int err; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) @@ -299,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) return NF_DROP; } - return helper->help(skb, protoff, ct, ctinfo); + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. + */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; } /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero @@ -468,6 +511,200 @@ static bool skb_nfct_cached(struct net *net, return true; } +#ifdef CONFIG_NF_NAT_NEEDED +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype) +{ + int hooknum, nh_off, err = NF_ACCEPT; + + nh_off = skb_network_offset(skb); + skb_pull(skb, nh_off); + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (skb->protocol == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto push; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto push; + } +#endif + } + /* Non-ICMP, fall thru to initialize if needed. */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto push; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto push; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); +push: + skb_push(skb, nh_off); + + return err; +} + +static void ovs_nat_update_key(struct sw_flow_key *key, + const struct sk_buff *skb, + enum nf_nat_manip_type maniptype) +{ + if (maniptype == NF_NAT_MANIP_SRC) { + __be16 src; + + key->ct.state |= OVS_CS_F_SRC_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.src = ip_hdr(skb)->saddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, + sizeof(key->ipv6.addr.src)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + src = udp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_TCP) + src = tcp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_SCTP) + src = sctp_hdr(skb)->source; + else + return; + + key->tp.src = src; + } else { + __be16 dst; + + key->ct.state |= OVS_CS_F_DST_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.dst = ip_hdr(skb)->daddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, + sizeof(key->ipv6.addr.dst)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + dst = udp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_TCP) + dst = tcp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_SCTP) + dst = sctp_hdr(skb)->dest; + else + return; + + key->tp.dst = dst; + } +} + +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + enum nf_nat_manip_type maniptype; + int err; + + if (nf_ct_is_untracked(ct)) { + /* A NAT action may only be performed on tracked packets. */ + return NF_ACCEPT; + } + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_ACCEPT; /* Can't NAT. */ + + /* Determine NAT type. + * Check if the NAT type can be deduced from the tracked connection. + * Make sure expected traffic is NATted only when committing. + */ + if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && + ct->status & IPS_NAT_MASK && + (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (info->nat & OVS_CT_SRC_NAT) { + maniptype = NF_NAT_MANIP_SRC; + } else if (info->nat & OVS_CT_DST_NAT) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; /* Connection is not NATed. */ + } + err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); + + /* Mark NAT done if successful and update the flow key. */ + if (err == NF_ACCEPT) + ovs_nat_update_key(key, skb, maniptype); + + return err; +} +#else /* !CONFIG_NF_NAT_NEEDED */ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + return NF_ACCEPT; +} +#endif + /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if * not done already. Update key with new CT state after passing the packet * through conntrack. @@ -509,19 +746,43 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, if (err != NF_ACCEPT) return -ENOENT; - ovs_ct_update_key(skb, info, key, true); + /* Clear CT state NAT flags to mark that we have not yet done + * NAT after the nf_conntrack_in() call. We can actually clear + * the whole state, as it will be re-initialized below. + */ + key->ct.state = 0; + + /* Update the key, but keep the NAT flags. */ + ovs_ct_update_key(skb, info, key, true, true); } - /* Call the helper only if: - * - nf_conntrack_in() was executed above ("!cached") for a confirmed - * connection, or - * - When committing an unconfirmed connection. - */ ct = nf_ct_get(skb, &ctinfo); - if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) && - ovs_ct_helper(skb, info->family) != NF_ACCEPT) { - WARN_ONCE(1, "helper rejected packet"); - return -EINVAL; + if (ct) { + /* Packets starting a new connection must be NATted before the + * helper, so that the helper knows about the NAT. We enforce + * this by delaying both NAT and helper calls for unconfirmed + * connections until the committing CT action. For later + * packets NAT and Helper may be called in either order. + * + * NAT will be done only if the CT action has NAT, and only + * once per packet (per zone), as guarded by the NAT bits in + * the key->ct.state. + */ + if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) && + (nf_ct_is_confirmed(ct) || info->commit) && + ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { + return -EINVAL; + } + + /* Call the helper only if: + * - nf_conntrack_in() was executed above ("!cached") for a + * confirmed connection, or + * - When committing an unconfirmed connection. + */ + if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) && + ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + return -EINVAL; + } } return 0; @@ -545,15 +806,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, if (exp) { u8 state; + /* NOTE: New connections are NATted and Helped only when + * committed, so we are not calling into NAT here. + */ state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - int err; - - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; - } + } else + return __ovs_ct_lookup(net, key, info, skb); return 0; } @@ -653,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return 0; } +#ifdef CONFIG_NF_NAT_NEEDED +static int parse_nat(const struct nlattr *attr, + struct ovs_conntrack_info *info, bool log) +{ + struct nlattr *a; + int rem; + bool have_ip_max = false; + bool have_proto_max = false; + bool ip_vers = (info->family == NFPROTO_IPV6); + + nla_for_each_nested(a, attr, rem) { + static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = { + [OVS_NAT_ATTR_SRC] = {0, 0}, + [OVS_NAT_ATTR_DST] = {0, 0}, + [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PERSISTENT] = {0, 0}, + [OVS_NAT_ATTR_PROTO_HASH] = {0, 0}, + [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, + }; + int type = nla_type(a); + + if (type > OVS_NAT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown NAT attribute (type=%d, max=%d).\n", + type, OVS_NAT_ATTR_MAX); + return -EINVAL; + } + + if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) { + OVS_NLERR(log, + "NAT attribute type %d has unexpected length (%d != %d).\n", + type, nla_len(a), + ovs_nat_attr_lens[type][ip_vers]); + return -EINVAL; + } + + switch (type) { + case OVS_NAT_ATTR_SRC: + case OVS_NAT_ATTR_DST: + if (info->nat) { + OVS_NLERR(log, + "Only one type of NAT may be specified.\n" + ); + return -ERANGE; + } + info->nat |= OVS_CT_NAT; + info->nat |= ((type == OVS_NAT_ATTR_SRC) + ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT); + break; + + case OVS_NAT_ATTR_IP_MIN: + nla_memcpy(&info->range.min_addr, a, nla_len(a)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_IP_MAX: + have_ip_max = true; + nla_memcpy(&info->range.max_addr, a, + sizeof(info->range.max_addr)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_PROTO_MIN: + info->range.min_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PROTO_MAX: + have_proto_max = true; + info->range.max_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PERSISTENT: + info->range.flags |= NF_NAT_RANGE_PERSISTENT; + break; + + case OVS_NAT_ATTR_PROTO_HASH: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; + break; + + case OVS_NAT_ATTR_PROTO_RANDOM: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY; + break; + + default: + OVS_NLERR(log, "Unknown nat attribute (%d).\n", type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem); + return -EINVAL; + } + if (!info->nat) { + /* Do not allow flags if no type is given. */ + if (info->range.flags) { + OVS_NLERR(log, + "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n" + ); + return -EINVAL; + } + info->nat = OVS_CT_NAT; /* NAT existing connections. */ + } else if (!info->commit) { + OVS_NLERR(log, + "NAT attributes may be specified only when CT COMMIT flag is also specified.\n" + ); + return -EINVAL; + } + /* Allow missing IP_MAX. */ + if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) { + memcpy(&info->range.max_addr, &info->range.min_addr, + sizeof(info->range.max_addr)); + } + /* Allow missing PROTO_MAX. */ + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + !have_proto_max) { + info->range.max_proto.all = info->range.min_proto.all; + } + return 0; +} +#endif + static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), @@ -662,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), .maxlen = sizeof(struct md_labels) }, [OVS_CT_ATTR_HELPER] = { .minlen = 1, - .maxlen = NF_CT_HELPER_NAME_LEN } + .maxlen = NF_CT_HELPER_NAME_LEN }, +#ifdef CONFIG_NF_NAT_NEEDED + /* NAT length is checked when parsing the nested attributes. */ + [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, +#endif }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -729,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, return -EINVAL; } break; +#ifdef CONFIG_NF_NAT_NEEDED + case OVS_CT_ATTR_NAT: { + int err = parse_nat(a, info, log); + + if (err) + return err; + break; + } +#endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", type); @@ -816,6 +1217,74 @@ err_free_ct: return err; } +#ifdef CONFIG_NF_NAT_NEEDED +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_CT_ATTR_NAT); + if (!start) + return false; + + if (info->nat & OVS_CT_SRC_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) + return false; + } else if (info->nat & OVS_CT_DST_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) + return false; + } else { + goto out; + } + + if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { + if (info->family == NFPROTO_IPV4) { + if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, + info->range.min_addr.ip) || + (info->range.max_addr.ip + != info->range.min_addr.ip && + (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, + info->range.max_addr.ip)))) + return false; +#if IS_ENABLED(CONFIG_NF_NAT_IPV6) + } else if (info->family == NFPROTO_IPV6) { + if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, + &info->range.min_addr.in6) || + (memcmp(&info->range.max_addr.in6, + &info->range.min_addr.in6, + sizeof(info->range.max_addr.in6)) && + (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, + &info->range.max_addr.in6)))) + return false; +#endif + } else { + return false; + } + } + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN, + ntohs(info->range.min_proto.all)) || + (info->range.max_proto.all != info->range.min_proto.all && + nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX, + ntohs(info->range.max_proto.all))))) + return false; + + if (info->range.flags & NF_NAT_RANGE_PERSISTENT && + nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM)) + return false; +out: + nla_nest_end(skb, start); + + return true; +} +#endif + int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, struct sk_buff *skb) { @@ -844,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, ct_info->helper->name)) return -EMSGSIZE; } - +#ifdef CONFIG_NF_NAT_NEEDED + if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) + return -EMSGSIZE; +#endif nla_nest_end(skb, start); return 0; diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index a7544f405c16..8f6230bd6183 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a); #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ - OVS_CS_F_INVALID | OVS_CS_F_TRACKED) + OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \ + OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) #else #include -- cgit v1.2.3 From e014e8468552236f0f0cb64c7c341c1dce506070 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 19 Mar 2016 00:54:50 +0800 Subject: ovs: internal_set_rx_headroom() can be static Signed-off-by: Fengguang Wu Signed-off-by: David S. Miller --- net/openvswitch/vport-internal_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 83a5534abd31..7c8b90bf0e54 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -138,7 +138,7 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) return stats; } -void internal_set_rx_headroom(struct net_device *dev, int new_hr) +static void internal_set_rx_headroom(struct net_device *dev, int new_hr) { dev->needed_headroom = new_hr; } -- cgit v1.2.3 From fe3a5f6c795810edb1646a840fec3c8c350c2a4e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 15 Mar 2016 12:00:35 +0900 Subject: openvswitch: allow output of MPLS packets on tunnel vports Currently output of MPLS packets on tunnel vports is not allowed by Open vSwitch. This is because historically encapsulation was done in such a way that the inner_protocol field of the skb needed to hold the inner protocol for both MPLS and tunnel encapsulation in order for GSO segmentation to be performed correctly. Since b2acd1dc3949 ("openvswitch: Use regular GRE net_device instead of vport") Open vSwitch makes use of lwt to output to tunnel netdevs which perform encapsulation. As no drivers expose support for MPLS offloads this means that GSO packets are segmented in software by validate_xmit_skb(), which is called from __dev_queue_xmit(), before tunnel encapsulation occurs. This means that the inner protocol of MPLS is no longer needed by the time encapsulation occurs and the contention on the inner_protocol field of the skb no longer occurs. Thus it is now safe to output MPLS to tunnel vports. Signed-off-by: Simon Horman Reviewed-by: Jesse Gross Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/openvswitch') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 58b8efc23668..689c17264221 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2044,9 +2044,6 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_TUNNEL: - if (eth_p_mpls(eth_type)) - return -EINVAL; - if (masked) return -EINVAL; /* Masked tunnel set not supported. */ -- cgit v1.2.3 From fca5fdf67de9e092fda23c9eb059ba968e7b5267 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Mar 2016 01:42:51 +0100 Subject: ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it eBPF defines this as BPF_TUNLEN_MAX and OVS just uses the hard-coded value inside struct sw_flow_key. Thus, add and use IP_TUNNEL_OPTS_MAX for this, which makes the code a bit more generic and allows to remove BPF_TUNLEN_MAX from eBPF code. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 7 +++++++ net/core/filter.c | 9 ++------- net/ipv4/ip_tunnel_core.c | 6 ++++++ net/openvswitch/flow.h | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net/openvswitch') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5dc2e454f866..c35dda9ec991 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -7,6 +7,8 @@ #include #include #include +#include + #include #include #include @@ -57,6 +59,11 @@ struct ip_tunnel_key { #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ +/* Maximum tunnel options length. */ +#define IP_TUNNEL_OPTS_MAX \ + GENMASK((FIELD_SIZEOF(struct ip_tunnel_info, \ + options_len) * BITS_PER_BYTE) - 1, 0) + struct ip_tunnel_info { struct ip_tunnel_key key; #ifdef CONFIG_DST_CACHE diff --git a/net/core/filter.c b/net/core/filter.c index 4c35d8325c34..b7177d01ecb0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1904,8 +1904,6 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -#define BPF_TUNLEN_MAX 255 - static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; @@ -1915,7 +1913,7 @@ static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; - if (unlikely(size > BPF_TUNLEN_MAX)) + if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; ip_tunnel_info_opts_set(info, from, size); @@ -1936,13 +1934,10 @@ static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info, - options_len) != 1); - /* Race is not possible, since it's called from verifier * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX, + md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, GFP_KERNEL); if (!md_dst) return NULL; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index eaca2449a09a..d27276f6f8dd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -398,6 +398,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { void __init ip_tunnel_core_init(void) { + /* If you land here, make sure whether increasing ip_tunnel_info's + * options_len is a reasonable choice with its usage in front ends + * (f.e., it's part of flow keys, etc). + */ + BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 1d055c559eaf..03378e75a67c 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -55,7 +55,7 @@ struct ovs_tunnel_info { FIELD_SIZEOF(struct sw_flow_key, recirc_id)) struct sw_flow_key { - u8 tun_opts[255]; + u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { -- cgit v1.2.3