From 6b436d338177729ae5c5a58e0acc5f35df603f82 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 27 Oct 2014 19:03:22 +0100 Subject: ipv4: remove set but unused variable sha unsigned char *sha (source) was already in original git version but was never used. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 648fa1490ea7..a896da50f398 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -498,7 +498,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt struct arphdr *rarp; unsigned char *rarp_ptr; __be32 sip, tip; - unsigned char *sha, *tha; /* s for "source", t for "target" */ + unsigned char *tha; /* t for "target" */ struct ic_device *d; if (!net_eq(dev_net(dev), &init_net)) @@ -549,7 +549,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop_unlock; /* should never happen */ /* Extract variable-width fields */ - sha = rarp_ptr; rarp_ptr += dev->addr_len; memcpy(&sip, rarp_ptr, 4); rarp_ptr += 4; -- cgit v1.2.3 From 8b13eddfdf04cbfa561725cfc42d6868fe896f56 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 16 Oct 2014 12:23:29 +0200 Subject: netfilter: refactor NAT redirect IPv4 to use it from nf_tables This patch refactors the IPv4 code so it can be usable both from xt and nf_tables. A similar patch follows-up to handle IPv6. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_nat_redirect.h | 9 +++ net/ipv4/netfilter/Kconfig | 6 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nf_nat_redirect_ipv4.c | 82 ++++++++++++++++++++++++++++ net/netfilter/Kconfig | 1 + net/netfilter/xt_REDIRECT.c | 44 +-------------- 6 files changed, 101 insertions(+), 42 deletions(-) create mode 100644 include/net/netfilter/ipv4/nf_nat_redirect.h create mode 100644 net/ipv4/netfilter/nf_nat_redirect_ipv4.c (limited to 'net/ipv4') diff --git a/include/net/netfilter/ipv4/nf_nat_redirect.h b/include/net/netfilter/ipv4/nf_nat_redirect.h new file mode 100644 index 000000000000..19e1df3a0a4d --- /dev/null +++ b/include/net/netfilter/ipv4/nf_nat_redirect.h @@ -0,0 +1,9 @@ +#ifndef _NF_NAT_REDIRECT_IPV4_H_ +#define _NF_NAT_REDIRECT_IPV4_H_ + +unsigned int +nf_nat_redirect_ipv4(struct sk_buff *skb, + const struct nf_nat_ipv4_multi_range_compat *mr, + unsigned int hooknum); + +#endif /* _NF_NAT_REDIRECT_IPV4_H_ */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4c019d5c3f57..a300e2c32b26 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -104,6 +104,12 @@ config NF_NAT_MASQUERADE_IPV4 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection). +config NF_NAT_REDIRECT_IPV4 + tristate "IPv4 redirect support" + help + This is the kernel functionality to provide NAT in the redirect + flavour (redirect packets to local machine). 
+ config NFT_MASQ_IPV4 tristate "IPv4 masquerading support for nf_tables" depends on NF_TABLES_IPV4 diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index f4cef5af0969..34e436c92015 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o +obj-$(CONFIG_NF_NAT_REDIRECT_IPV4) += nf_nat_redirect_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o diff --git a/net/ipv4/netfilter/nf_nat_redirect_ipv4.c b/net/ipv4/netfilter/nf_nat_redirect_ipv4.c new file mode 100644 index 000000000000..a220552fc532 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_redirect_ipv4.c @@ -0,0 +1,82 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * Copyright (c) 2011 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 + * NAT funded by Astaro. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +nf_nat_redirect_ipv4(struct sk_buff *skb, + const struct nf_nat_ipv4_multi_range_compat *mr, + unsigned int hooknum) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 newdst; + struct nf_nat_range newrange; + + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_OUT); + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (hooknum == NF_INET_LOCAL_OUT) { + newdst = htonl(0x7F000001); + } else { + struct in_device *indev; + struct in_ifaddr *ifa; + + newdst = 0; + + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + if (indev != NULL) { + ifa = indev->ifa_list; + newdst = ifa->ifa_local; + } + rcu_read_unlock(); + + if (!newdst) + return NF_DROP; + } + + /* Transfer from original range. */ + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newdst; + newrange.max_addr.ip = newdst; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; + + /* Hand modified range to generic setup. 
*/ + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} +EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ae5096ab65eb..a0716a3f08b0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -835,6 +835,7 @@ config NETFILTER_XT_TARGET_RATEEST config NETFILTER_XT_TARGET_REDIRECT tristate "REDIRECT target support" depends on NF_NAT + select NF_NAT_REDIRECT_IPV4 ---help--- REDIRECT is a special case of NAT: all incoming connections are mapped onto the incoming interface's address, causing the packets to diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 22a10309297c..b4ffac5fe8e9 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -26,6 +26,7 @@ #include #include #include +#include static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; @@ -98,48 +99,7 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par) static unsigned int redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - __be32 newdst; - const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range newrange; - - NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_OUT); - - ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - - /* Local packets: make them go to loopback */ - if (par->hooknum == NF_INET_LOCAL_OUT) - newdst = htonl(0x7F000001); - else { - struct in_device *indev; - struct in_ifaddr *ifa; - - newdst = 0; - - rcu_read_lock(); - indev = __in_dev_get_rcu(skb->dev); - if (indev && (ifa = indev->ifa_list)) - newdst = ifa->ifa_local; - rcu_read_unlock(); - - if (!newdst) - return NF_DROP; - } - - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newdst; - newrange.max_addr.ip = newdst; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); + return nf_nat_redirect_ipv4(skb, par->targinfo, par->hooknum); } static struct xt_target redirect_tg_reg[] __read_mostly = { -- cgit v1.2.3 From e9105f1bead4ec3f64904564c7c6268185d6b363 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Fri, 17 Oct 2014 12:39:09 +0200 Subject: netfilter: nf_tables: add new expression nft_redir This new expression provides NAT in the redirect flavour, which is to redirect packets to local machine. 
Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_redir.h | 21 +++++++ include/uapi/linux/netfilter/nf_tables.h | 16 ++++++ net/ipv4/netfilter/Kconfig | 9 +++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_redir_ipv4.c | 77 +++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 9 +++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_redir_ipv6.c | 77 +++++++++++++++++++++++++ net/netfilter/Kconfig | 9 +++ net/netfilter/Makefile | 1 + net/netfilter/nft_redir.c | 98 ++++++++++++++++++++++++++++++++ 11 files changed, 319 insertions(+) create mode 100644 include/net/netfilter/nft_redir.h create mode 100644 net/ipv4/netfilter/nft_redir_ipv4.c create mode 100644 net/ipv6/netfilter/nft_redir_ipv6.c create mode 100644 net/netfilter/nft_redir.c (limited to 'net/ipv4') diff --git a/include/net/netfilter/nft_redir.h b/include/net/netfilter/nft_redir.h new file mode 100644 index 000000000000..a2d67546afab --- /dev/null +++ b/include/net/netfilter/nft_redir.h @@ -0,0 +1,21 @@ +#ifndef _NFT_REDIR_H_ +#define _NFT_REDIR_H_ + +struct nft_redir { + enum nft_registers sreg_proto_min:8; + enum nft_registers sreg_proto_max:8; + u16 flags; +}; + +extern const struct nla_policy nft_redir_policy[]; + +int nft_redir_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr); + +int nft_redir_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data); + +#endif /* _NFT_REDIR_H_ */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index f31fe7b660a5..16f62a5cf04d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -837,6 +837,22 @@ enum nft_masq_attributes { }; #define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) +/** + * enum nft_redir_attributes - nf_tables redirect expression netlink attributes + * + * @NFTA_REDIR_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) + * @NFTA_REDIR_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + * @NFTA_REDIR_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_redir_attributes { + NFTA_REDIR_UNSPEC, + NFTA_REDIR_REG_PROTO_MIN, + NFTA_REDIR_REG_PROTO_MAX, + NFTA_REDIR_FLAGS, + __NFTA_REDIR_MAX +}; +#define NFTA_REDIR_MAX (__NFTA_REDIR_MAX - 1) + /** * enum nft_gen_attributes - nf_tables ruleset generation attributes * diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a300e2c32b26..8358b2da1549 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -119,6 +119,15 @@ config NFT_MASQ_IPV4 This is the expression that provides IPv4 masquerading support for nf_tables. +config NFT_REDIR_IPV4 + tristate "IPv4 redirect support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_REDIR + select NF_NAT_REDIRECT_IPV4 + help + This is the expression that provides IPv4 redirect support for + nf_tables. 
+ config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 34e436c92015..902bcd1597bb 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o +obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c new file mode 100644 index 000000000000..643c5967aa27 --- /dev/null +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_redir_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_ipv4_multi_range_compat mr; + unsigned int verdict; + + memset(&mr, 0, sizeof(mr)); + if (priv->sreg_proto_min) { + mr.range[0].min.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + mr.range[0].max.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + mr.range[0].flags |= priv->flags; + + verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum); + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_redir_ipv4_type; +static const struct nft_expr_ops nft_redir_ipv4_ops = { + .type = &nft_redir_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_ipv4_eval, + .init = nft_redir_init, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "redir", + .ops = &nft_redir_ipv4_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_REDIR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_ipv4_module_init(void) +{ + return nft_register_expr(&nft_redir_ipv4_type); +} + +static void __exit nft_redir_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_redir_ipv4_type); +} + +module_init(nft_redir_ipv4_module_init); +module_exit(nft_redir_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 462eebbf4377..0dbe5c7953e5 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -97,6 +97,15 @@ config NFT_MASQ_IPV6 This is the expression that provides IPv4 masquerading support for nf_tables. +config NFT_REDIR_IPV6 + tristate "IPv6 redirect support for nf_tables" + depends on NF_TABLES_IPV6 + depends on NFT_REDIR + select NF_NAT_REDIRECT_IPV6 + help + This is the expression that provides IPv4 redirect support for + nf_tables. 
+ endif # NF_NAT_IPV6 config IP6_NF_IPTABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 6c2baab10f21..d2ac9f5f212c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o +obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c new file mode 100644 index 000000000000..83420eeaad1c --- /dev/null +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_redir_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + memset(&range, 0, sizeof(range)); + if (priv->sreg_proto_min) { + range.min_proto.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + range.max_proto.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + range.flags |= priv->flags; + + verdict = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->ops->hooknum); + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_redir_ipv6_type; +static const struct nft_expr_ops nft_redir_ipv6_ops = { + .type = &nft_redir_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_ipv6_eval, + .init = nft_redir_init, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "redir", + .ops = &nft_redir_ipv6_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_REDIR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_ipv6_module_init(void) +{ + return nft_register_expr(&nft_redir_ipv6_type); +} + +static void __exit nft_redir_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_redir_ipv6_type); +} + +module_init(nft_redir_ipv6_module_init); +module_exit(nft_redir_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 49deb4edbac6..373486ae4159 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -505,6 +505,15 @@ config NFT_MASQ This option adds the "masquerade" expression that you can use to perform NAT in the masquerade flavour. +config NFT_REDIR + depends on NF_TABLES + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables redirect support" + help + This options adds the "redirect" expression that you can use + to perform NAT in the redirect flavour. 
+ config NFT_NAT depends on NF_TABLES depends on NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index a9571be3f791..f3eb4680f2ec 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o +obj-$(CONFIG_NFT_REDIR) += nft_redir.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c new file mode 100644 index 000000000000..e27b4e35718a --- /dev/null +++ b/net/netfilter/nft_redir.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { + [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, + [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, + [NFTA_REDIR_FLAGS] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_redir_policy); + +int nft_redir_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_redir *priv = nft_expr_priv(expr); + u32 nla_be32; + int err; + + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; + + if (tb[NFTA_REDIR_REG_PROTO_MIN]) { + nla_be32 = nla_get_be32(tb[NFTA_REDIR_REG_PROTO_MIN]); + priv->sreg_proto_min = ntohl(nla_be32); + err = nft_validate_input_register(priv->sreg_proto_min); + if (err < 0) + return err; + + if (tb[NFTA_REDIR_REG_PROTO_MAX]) { + nla_be32 = nla_get_be32(tb[NFTA_REDIR_REG_PROTO_MAX]); + priv->sreg_proto_max = ntohl(nla_be32); + err = nft_validate_input_register(priv->sreg_proto_max); + if (err < 0) + return err; + } else { + priv->sreg_proto_max = priv->sreg_proto_min; + } + } + + if (tb[NFTA_REDIR_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_redir_init); + +int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_redir *priv = nft_expr_priv(expr); + + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_REDIR_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_REDIR_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } + + if (priv->flags != 0 && + nla_put_be32(skb, NFTA_REDIR_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_redir_dump); + +int nft_redir_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); +} +EXPORT_SYMBOL_GPL(nft_redir_validate); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); -- cgit v1.2.3 From dca145ffaa8d39ea1904491ac81b92b7049372c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Oct 2014 21:45:24 -0700 Subject: tcp: allow for bigger reordering level While testing upcoming Yaogong patch (converting out of order queue into an RB tree), I hit the max reordering level of linux TCP stack. 
Reordering level was limited to 127 for no good reason, and some network setups [1] can easily reach this limit and get limited throughput. Allow a new max limit of 300, and add a sysctl to allow admins to even allow bigger (or lower) values if needed. [1] Aggregation of links, per packet load balancing, fabrics not doing deep packet inspections, alternative TCP congestion modules... Signed-off-by: Eric Dumazet Cc: Yaogong Wang Signed-off-by: David S. Miller --- Documentation/networking/bonding.txt | 7 ++----- Documentation/networking/ip-sysctl.txt | 10 +++++++++- include/linux/tcp.h | 4 ++-- include/net/tcp.h | 4 +--- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_input.c | 3 ++- 6 files changed, 23 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index eeb5b2e97bed..83bf4986baea 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -2230,11 +2230,8 @@ balance-rr: This mode is the only mode that will permit a single It is possible to adjust TCP/IP's congestion limits by altering the net.ipv4.tcp_reordering sysctl parameter. The - usual default value is 3, and the maximum useful value is 127. - For a four interface balance-rr bond, expect that a single - TCP/IP stream will utilize no more than approximately 2.3 - interface's worth of throughput, even after adjusting - tcp_reordering. + usual default value is 3. But keep in mind TCP stack is able + to automatically increase this when it detects reorders. Note that the fraction of packets that will be delivered out of order is highly variable, and is unlikely to be zero. The level diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 0307e2875f21..9028b879a97b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -376,9 +376,17 @@ tcp_orphan_retries - INTEGER may consume significant resources. Cf. tcp_max_orphans. tcp_reordering - INTEGER - Maximal reordering of packets in a TCP stream. + Initial reordering level of packets in a TCP stream. + TCP stack can then dynamically adjust flow reordering level + between this initial value and tcp_max_reordering Default: 3 +tcp_max_reordering - INTEGER + Maximal reordering level of packets in a TCP stream. + 300 is a fairly conservative value, but you might increase it + if paths are using per packet load balancing (like bonding rr mode) + Default: 300 + tcp_retrans_collapse - BOOLEAN Bug-to-bug compatibility with some broken printers. On retransmit try to send bigger packets to work around bugs in diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c2dee7deefa8..f566b8567892 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -204,10 +204,10 @@ struct tcp_sock { u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ - u8 reordering; /* Packet reordering metric. */ + u8 keepalive_probes; /* num of allowed keep alive probes */ + u32 reordering; /* Packet reordering metric. */ u32 snd_up; /* Urgent pointer */ - u8 keepalive_probes; /* num of allowed keep alive probes */ /* * Options received (usually on last packet, some only on SYN packets). 
*/ diff --git a/include/net/tcp.h b/include/net/tcp.h index c73fc145ee45..3a35b1500359 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -70,9 +70,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 -/* Maximal reordering. */ -#define TCP_MAX_REORDERING 127 - /* Maximal number of ACKs sent quickly to accelerate slow-start. */ #define TCP_MAX_QUICKACKS 16U @@ -252,6 +249,7 @@ extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; +extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b3c53c8b331e..e0ee384a448f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -495,6 +495,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_max_reordering", + .data = &sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a12b455928e5..9a18cdd633f3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; +int sysctl_tcp_max_reordering __read_mostly = 300; EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > tp->reordering) { int mib_idx; - tp->reordering = min(TCP_MAX_REORDERING, metric); + tp->reordering = min(sysctl_tcp_max_reordering, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) -- cgit v1.2.3 From 8ac2bde2a4a05c38e2bd733bea94507cb1461e06 Mon Sep 17 00:00:00 2001 From: Marcelo Leitner Date: Wed, 29 Oct 2014 10:51:13 -0200 Subject: netfilter: log: protect nf_log_register against double registering Currently, despite the comment right before the function, nf_log_register allows registering two loggers on with the same type and end up overwriting the previous register. Not a real issue today as current tree doesn't have two loggers for the same type but it's better to get this protected. Also make sure that all of its callers do error checking. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_log_arp.c | 12 +++++++++++- net/ipv4/netfilter/nf_log_ipv4.c | 12 +++++++++++- net/ipv6/netfilter/nf_log_ipv6.c | 12 +++++++++++- net/netfilter/nf_log.c | 16 +++++++++++++--- 4 files changed, 46 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index ccfc78db12ee..0c8799a0c9e4 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -10,6 +10,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -130,8 +131,17 @@ static int __init nf_log_arp_init(void) if (ret < 0) return ret; - nf_log_register(NFPROTO_ARP, &nf_arp_logger); + ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); + if (ret < 0) { + pr_err("failed to register logger\n"); + goto err1; + } + return 0; + +err1: + unregister_pernet_subsys(&nf_log_arp_net_ops); + return ret; } static void __exit nf_log_arp_exit(void) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 078bdca1b607..75101980eeee 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -5,6 +5,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -366,8 +367,17 @@ static int __init nf_log_ipv4_init(void) if (ret < 0) return ret; - nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + if (ret < 0) { + pr_err("failed to register logger\n"); + goto err1; + } + return 0; + +err1: + unregister_pernet_subsys(&nf_log_ipv4_net_ops); + return ret; } static void __exit nf_log_ipv4_exit(void) diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 7b17a0be93e7..7fc34d1681a1 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -5,6 +5,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -398,8 +399,17 @@ static int __init nf_log_ipv6_init(void) if (ret < 0) return ret; - nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); + ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); + if (ret < 0) { + pr_err("failed to register logger\n"); + goto err1; + } + return 0; + +err1: + unregister_pernet_subsys(&nf_log_ipv6_net_ops); + return ret; } static void __exit nf_log_ipv6_exit(void) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 5eaf047ed37f..9562e393fdf7 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -75,6 +75,7 @@ EXPORT_SYMBOL(nf_log_unset); int nf_log_register(u_int8_t pf, struct nf_logger *logger) { int i; + int ret = 0; if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) return -EINVAL; @@ -82,16 +83,25 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) mutex_lock(&nf_log_mutex); if (pf == NFPROTO_UNSPEC) { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + if (rcu_access_pointer(loggers[i][logger->type])) { + ret = -EEXIST; + goto unlock; + } + } for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) rcu_assign_pointer(loggers[i][logger->type], logger); } else { - /* register at end of list to honor first register win */ + if (rcu_access_pointer(loggers[pf][logger->type])) { + ret = -EEXIST; + goto unlock; + } rcu_assign_pointer(loggers[pf][logger->type], logger); } +unlock: mutex_unlock(&nf_log_mutex); - - return 0; + return ret; } EXPORT_SYMBOL(nf_log_register); -- cgit v1.2.3 From f4e715c3254e3c0167b5d4272901a2b248b65ad2 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 29 Oct 2014 16:05:06 -0700 Subject: ipv4: minor spelling fixes Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/geneve.c | 2 +- net/ipv4/tcp_input.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 065cd94c640c..91861fe77ed1 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -104,7 +104,7 @@ static void geneve_build_header(struct genevehdr *geneveh, memcpy(geneveh->options, options, options_len); } -/* Transmit a fully formated Geneve frame. +/* Transmit a fully formatted Geneve frame. * * When calling this function. The skb->data should point * to the geneve header which is fully formed. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9a18cdd633f3..df8b3c12a173 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5866,7 +5866,7 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) * If we receive a SYN packet with these bits set, it means a * network is playing bad games with TOS bits. In order to * avoid possible false congestion notifications, we disable - * TCP ECN negociation. + * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. This is required for DCTCP * congestion control; it requires setting ECT on all packets, -- cgit v1.2.3 From 646697b9e3fef913bb6393ebfb6115c442a96be7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Oct 2014 02:55:38 +0100 Subject: syncookies: only increment SYNCOOKIESFAILED on validation error Only count packets that failed cookie-authentication. We can get SYNCOOKIESFAILED > 0 while we never even sent a single cookie. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 7 +++++-- net/ipv6/syncookies.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 32b98d0207b4..4ac7bcaf2f46 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -275,8 +275,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; - if (tcp_synq_no_recent_overflow(sk) || - (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { + if (tcp_synq_no_recent_overflow(sk)) + goto out; + + mss = __cookie_v4_check(ip_hdr(skb), th, cookie); + if (mss == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 0e26e795b703..be291baa2ec2 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -171,8 +171,11 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; - if (tcp_synq_no_recent_overflow(sk) || - (mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) { + if (tcp_synq_no_recent_overflow(sk)) + goto out; + + mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie); + if (mss == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } -- cgit v1.2.3 From cd2145358e7a5bb1798a185e5ef199ea49c69dd7 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 30 Oct 2014 12:48:08 -0400 Subject: tcp: Correction to RFC number in comment Challenge ACK is described in RFC 5961, fix typo. Signed-off-by: Sowmini Varadhan Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index df8b3c12a173..4e4617e90417 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5029,7 +5029,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* step 3: check security and precedence [ignored] */ /* step 4: Check for a SYN - * RFC 5691 4.2 : Send a challenge ack + * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { syn_challenge: -- cgit v1.2.3 From 4973404f81b5474748d39ff6f6beb6b1bc5e76f3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:10:01 +0100 Subject: cipso: kerneldoc warning fix Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 4715f25dfe03..e24df5a23446 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -539,7 +539,7 @@ doi_add_return: /** * cipso_v4_doi_free - Frees a DOI definition - * @entry: the entry's RCU field + * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. -- cgit v1.2.3 From 4c787b162607fa4ee9da021a8683d13dd5c7cb9f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:13:50 +0100 Subject: ipv4: include linux/bug.h instead of asm/bug.h Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index e24df5a23446..253a61b4e69e 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include /* List of available DOI definitions */ -- cgit v1.2.3 From 988b13438c0c6893192d4ca082eb4df39222b07b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:19:19 +0100 Subject: cipso: remove NULL assignment on static Also add blank line after structure declarations Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 253a61b4e69e..5160c710f2eb 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -72,6 +72,7 @@ struct cipso_v4_map_cache_bkt { u32 size; struct list_head list; }; + struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; @@ -82,7 +83,8 @@ struct cipso_v4_map_cache_entry { u32 activity; struct list_head list; }; -static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL; + +static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt = 0; -- cgit v1.2.3 From b92022f3e507d4d8415b34d6ac81adf2d45c2071 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:25:38 +0100 Subject: tcp: spelling s/plugable/pluggable Signed-off-by: Fabian Frederick Signed-off-by: David S. 
Miller --- net/ipv4/tcp_cong.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index b1c5970d47a1..27ead0dd16bc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,5 +1,5 @@ /* - * Plugable TCP congestion control support and newReno + * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * -- cgit v1.2.3 From c9f503b00622525bb9a549ff38c4bb20b4287b84 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:31:26 +0100 Subject: ipv4: use seq_puts instead of seq_printf where possible Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8e3eb39f84e7..f0d4eb8b99b9 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -296,12 +296,12 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, int j; if (count) { - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } @@ -342,7 +342,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_printf(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", -- cgit v1.2.3 From 0d3979b9c711bf823d65e8abe9ab2e8bff0669fa Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:37:46 +0100 Subject: ipv4: remove 0/NULL assignment on static static values are automatically initialized to 0 Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index a896da50f398..7fa18bc7e47f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -115,7 +115,7 @@ */ int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ -static int ic_enable __initdata = 0; /* IP config enabled? */ +static int ic_enable __initdata; /* IP config enabled? */ /* Protocol choice */ int ic_proto_enabled __initdata = 0 @@ -130,7 +130,7 @@ int ic_proto_enabled __initdata = 0 #endif ; -static int ic_host_name_set __initdata = 0; /* Host name set by us? */ +static int ic_host_name_set __initdata; /* Host name set by us? 
*/ __be32 ic_myaddr = NONE; /* My IP address */ static __be32 ic_netmask = NONE; /* Netmask for local subnet */ @@ -160,17 +160,17 @@ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; /* Protocols supported by available interfaces */ -static int ic_proto_have_if __initdata = 0; +static int ic_proto_have_if __initdata; /* MTU for boot device */ -static int ic_dev_mtu __initdata = 0; +static int ic_dev_mtu __initdata; #ifdef IPCONFIG_DYNAMIC static DEFINE_SPINLOCK(ic_recv_lock); -static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ +static volatile int ic_got_reply __initdata; /* Proto(s) that replied */ #endif #ifdef IPCONFIG_DHCP -static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ +static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */ #endif @@ -186,8 +186,8 @@ struct ic_device { __be32 xid; }; -static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ -static struct net_device *ic_dev __initdata = NULL; /* Selected device */ +static struct ic_device *ic_first_dev __initdata; /* List of open device */ +static struct net_device *ic_dev __initdata; /* Selected device */ static bool __init ic_is_init_dev(struct net_device *dev) { -- cgit v1.2.3 From aa1f731e52807077e9e13a86c0cad12d442c8fd4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:44:04 +0100 Subject: inet: frags: remove inline on static in c file remove __inline__ / inline and let compiler decide what to do with static functions Inspired-by: "David S. Miller" Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2811cc18701a..4d964dadd655 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -80,7 +80,7 @@ struct ipq { struct inet_peer *peer; }; -static inline u8 ip4_frag_ecn(u8 tos) +static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } @@ -148,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } -static __inline__ void ip4_frag_free(struct inet_frag_queue *q) +static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; @@ -160,7 +160,7 @@ static __inline__ void ip4_frag_free(struct inet_frag_queue *q) /* Destruction primitives. */ -static __inline__ void ipq_put(struct ipq *ipq) +static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q, &ip4_frags); } @@ -236,7 +236,7 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -256,7 +256,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) } /* Is the fragment too far ahead to be part of ipq? 
*/ -static inline int ip_frag_too_far(struct ipq *qp) +static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = sysctl_ipfrag_max_dist; @@ -795,16 +795,16 @@ static void __init ip4_frags_ctl_register(void) register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else -static inline int ip4_frags_ns_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ns_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { } -static inline void __init ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { } #endif -- cgit v1.2.3 From c18450a52a10a5c4cea3dc426c40447a7152290f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:48:41 +0100 Subject: udp: remove else after return else is unnecessary after return 0 in __udp4_lib_rcv() Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd0db5471bb5..cf0cece50b6c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1777,14 +1777,14 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (ret > 0) return -ret; return 0; - } else { - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); - - sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); } + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); + + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + if (sk != NULL) { int ret; -- cgit v1.2.3 From 436f7c206860729d543a457aca5887e52039a5f4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 20:52:14 +0100 Subject: igmp: remove camel case definitions use standard uppercase for definitions Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index fb70e3ecc3e4..3f8051358202 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -112,17 +112,17 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Unsolicited_Report_Interval (10*HZ) -#define IGMP_V3_Unsolicited_Report_Interval (1*HZ) -#define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Query_Robustness_Variable 2 +#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) +#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) +#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 -#define IGMP_Initial_Report_Delay (1) +#define IGMP_INITIAL_REPORT_DELAY (1) -/* IGMP_Initial_Report_Delay is not from IGMP specs! +/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. 
It seems more natural and still does not @@ -879,15 +879,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (ih->code == 0) { /* Alas, old v1 router presents here. */ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_Router_Present_Timeout; + IGMP_V1_ROUTER_PRESENT_TIMEOUT; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_Router_Present_Timeout; + IGMP_V2_ROUTER_PRESENT_TIMEOUT; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -899,7 +899,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; @@ -1218,7 +1218,7 @@ static void igmp_group_added(struct ip_mc_list *im) return; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); - igmp_start_timer(im, IGMP_Initial_Report_Delay); + igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } @@ -1541,7 +1541,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; #ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; #endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, -- cgit v1.2.3 From 274e2da0ecb4d28e3d4e9f029b096e0e8c401a66 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Nov 2014 17:35:01 +0100 Subject: syncookies: avoid magic values and document which-bit-is-what-option Was a bit more difficult to read than needed due to magic shifts; add defines and document the used encoding scheme. Joint work with Daniel Borkmann. Acked-by: Eric Dumazet Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4ac7bcaf2f46..c3792c0557dd 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,10 +19,6 @@ #include #include -/* Timestamps: lowest bits store TCP options */ -#define TSBITS 6 -#define TSMASK (((__u32)1 << TSBITS) - 1) - extern int sysctl_tcp_syncookies; static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; @@ -30,6 +26,30 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) +/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK + * stores TCP options: + * + * MSB LSB + * | 31 ... 6 | 5 | 4 | 3 2 1 0 | + * | Timestamp | ECN | SACK | WScale | + * + * When we receive a valid cookie-ACK, we look at the echoed tsval (if + * any) to figure out which TCP options we should use for the rebuilt + * connection. + * + * A WScale setting of '0xf' (which is an invalid scaling value) + * means that original syn did not include the TCP window scaling option. 
+ */ +#define TS_OPT_WSCALE_MASK 0xf +#define TS_OPT_SACK BIT(4) +#define TS_OPT_ECN BIT(5) +/* There is no TS_OPT_TIMESTAMP: + * if ACK contains timestamp option, we already know it was + * requested/supported by the syn/synack exchange. + */ +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) + static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); @@ -67,9 +87,11 @@ __u32 cookie_init_timestamp(struct request_sock *req) ireq = inet_rsk(req); - options = ireq->wscale_ok ? ireq->snd_wscale : 0xf; - options |= ireq->sack_ok << 4; - options |= ireq->ecn_ok << 5; + options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; + if (ireq->sack_ok) + options |= TS_OPT_SACK; + if (ireq->ecn_ok) + options |= TS_OPT_ECN; ts = ts_now & ~TSMASK; ts |= options; @@ -219,16 +241,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * - * The lowest 4 bits store snd_wscale. - * next 2 bits indicate SACK and ECN support. - * * return false if we decode an option that should not be. */ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, struct net *net, bool *ecn_ok) { /* echoed timestamp, lowest bits contain options */ - u32 options = tcp_opt->rcv_tsecr & TSMASK; + u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); @@ -238,19 +257,20 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, if (!sysctl_tcp_timestamps) return false; - tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; - *ecn_ok = (options >> 5) & 1; + tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; + *ecn_ok = options & TS_OPT_ECN; if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn) return false; if (tcp_opt->sack_ok && !sysctl_tcp_sack) return false; - if ((options & 0xf) == 0xf) + if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; - tcp_opt->snd_wscale = options & 0xf; + tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; + return sysctl_tcp_window_scaling != 0; } EXPORT_SYMBOL(cookie_check_timestamp); -- cgit v1.2.3 From f1673381b1481a409238d4552a0700d490c5b36c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Nov 2014 17:35:02 +0100 Subject: syncookies: split cookie_check_timestamp() into two functions The function cookie_check_timestamp(), both called from IPv4/6 context, is being used to decode the echoed timestamp from the SYN/ACK into TCP options used for follow-up communication with the peer. We can remove ECN handling from that function, split it into a separate one, and simply rename the original function into cookie_decode_options(). cookie_decode_options() just fills in tcp_option struct based on the echoed timestamp received from the peer. Anything that fails in this function will actually discard the request socket. While this is the natural place for decoding options such as ECN which commit 172d69e63c7f ("syncookies: add support for ECN") added, we argue that in particular for ECN handling, it can be checked at a later point in time as the request sock would actually not need to be dropped from this, but just ECN support turned off. Therefore, we split this functionality into cookie_ecn_ok(), which tells us if the timestamp indicates ECN support AND the tcp_ecn sysctl is enabled. 
This prepares for per-route ECN support: just looking at the tcp_ecn sysctl won't be enough anymore at that point; if the timestamp indicates ECN and sysctl tcp_ecn == 0, we will also need to check the ECN dst metric. This would mean adding a route lookup to cookie_check_timestamp(), which we definitely want to avoid. As we already do a route lookup at a later point in cookie_{v4,v6}_check(), we can simply make use of that as well for the new cookie_ecn_ok() function w/o any additional cost. Joint work with Daniel Borkmann. Acked-by: Eric Dumazet Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/tcp.h | 9 ++++----- net/ipv4/syncookies.c | 31 +++++++++++++++++++++---------- net/ipv6/syncookies.c | 5 ++--- 3 files changed, 27 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3a35b1500359..36c5084964cd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -490,17 +490,16 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mss); -#endif - __u32 cookie_init_timestamp(struct request_sock *req); -bool cookie_check_timestamp(struct tcp_options_received *opt, struct net *net, - bool *ecn_ok); +bool cookie_timestamp_decode(struct tcp_options_received *opt); +bool cookie_ecn_ok(const struct tcp_options_received *opt, + const struct net *net); /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); -#ifdef CONFIG_SYN_COOKIES + u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c3792c0557dd..6de772500ee9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -241,10 +241,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * - * return false if we decode an option that should not be. + * return false if we decode a tcp option that is disabled + * on the host. */ -bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, - struct net *net, bool *ecn_ok) +bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr; @@ -258,9 +258,6 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? 
TCP_SACK_SEEN : 0; - *ecn_ok = options & TS_OPT_ECN; - if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn) - return false; if (tcp_opt->sack_ok && !sysctl_tcp_sack) return false; @@ -273,7 +270,22 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, return sysctl_tcp_window_scaling != 0; } -EXPORT_SYMBOL(cookie_check_timestamp); +EXPORT_SYMBOL(cookie_timestamp_decode); + +bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, + const struct net *net) +{ + bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; + + if (!ecn_ok) + return false; + + if (net->ipv4.sysctl_tcp_ecn) + return true; + + return false; +} +EXPORT_SYMBOL(cookie_ecn_ok); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { @@ -289,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) int mss; struct rtable *rt; __u8 rcv_wscale; - bool ecn_ok = false; struct flowi4 fl4; if (!sysctl_tcp_syncookies || !th->ack || th->rst) @@ -310,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + if (!cookie_timestamp_decode(&tcp_opt)) goto out; ret = NULL; @@ -328,7 +339,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr; ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; @@ -377,6 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk)); ret = get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index be291baa2ec2..52cc8cb02c0c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -166,7 +166,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) int mss; struct dst_entry *dst; __u8 rcv_wscale; - bool ecn_ok = false; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; @@ -186,7 +185,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + if (!cookie_timestamp_decode(&tcp_opt)) goto out; ret = NULL; @@ -223,7 +222,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) req->expires = 0UL; req->num_retrans = 0; - ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; @@ -264,6 +262,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk)); ret = get_cookie_sock(sk, skb, req, dst); out: -- cgit v1.2.3 From f7b3bec6f5167efaf56b756abfafb924cb1d3050 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Nov 2014 17:35:03 +0100 Subject: net: allow setting ecn via routing table This patch allows to set ECN on a per-route basis in case the sysctl tcp_ecn is not set to 1. In other words, when ECN is set for specific routes, it provides a tcp_ecn=1 behaviour for that route while the rest of the stack acts according to the global settings. 
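Condensed into a standalone sketch, the effective decision this patch implements looks as follows; dst_feature() and RTAX_FEATURE_ECN are the real kernel names it relies on, while use_ecn_for_syn() and the three boolean helpers are illustrative stand-ins for kernel state:

#include <stdbool.h>
#include <stdio.h>

static bool sysctl_tcp_ecn_is_1(void)   { return false; } /* net.ipv4.tcp_ecn == 1 */
static bool ca_needs_ecn(void)          { return false; } /* e.g. DCTCP, via tcp_ca_needs_ecn() */
static bool route_has_ecn_feature(void) { return true;  } /* dst_feature(dst, RTAX_FEATURE_ECN) */

static bool use_ecn_for_syn(void)
{
    if (sysctl_tcp_ecn_is_1() || ca_needs_ecn())
        return true;
    /* New with this patch: tcp_ecn=1 behaviour for this route only. */
    return route_has_ecn_feature();
}

int main(void)
{
    printf("request ECN in SYN: %s\n", use_ecn_for_syn() ? "yes" : "no");
    return 0;
}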
One can use 'ip route change dev $dev $net features ecn' to toggle this. Having a more fine-grained per-route setting can be beneficial for various reasons, for example, 1) within data centers, or 2) local ISPs may deploy ECN support for their own video/streaming services [1], etc. There was a recent measurement study/paper [2] which scanned Alexa's publicly available top million websites list from a vantage point in the US, Europe and Asia: Half of the Alexa list will now happily use ECN (tcp_ecn=2, most likely blamed on commit 255cac91c3 ("tcp: extend ECN sysctl to allow server-side only ECN") ;)); the break in connectivity on-path was found in about 1 in 10,000 cases. Timeouts rather than receiving back RSTs were much more common in the negotiation phase (and mostly seen in the Alexa middle band, ranks around 50k-150k): of the 12-thousand hosts on which there _may_ be ECN-linked connection failures, only 79 failed with an RST when ECN was requested while _not_ failing with an RST when ECN was not requested. It's unclear, though, how much equipment in the wild actually marks CE when buffers start to fill up. We thought about a fallback to non-ECN for retransmitted SYNs as another global option (which could perhaps one day be made default), but as Eric points out, there's much more work needed to detect broken middleboxes. Two examples Eric mentioned are buggy firewalls that accept only a single SYN per flow, and middleboxes that successfully let an ECN flow establish, but later mark CE for all packets (so cwnd converges to 1). [1] http://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf, p.15 [2] http://ecn.ethz.ch/ Joint work with Daniel Borkmann. Reference: http://thread.gmane.org/gmane.linux.network/335797 Suggested-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/net/tcp.h | 2 +- net/ipv4/syncookies.c | 6 +++--- net/ipv4/tcp_input.c | 25 +++++++++++++++---------- net/ipv4/tcp_output.c | 13 +++++++++++-- net/ipv6/syncookies.c | 2 +- 5 files changed, 31 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 36c5084964cd..f50f29faf76f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -493,7 +493,7 @@ __u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, __u32 cookie_init_timestamp(struct request_sock *req); bool cookie_timestamp_decode(struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, - const struct net *net); + const struct net *net, const struct dst_entry *dst); /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6de772500ee9..45fe60c5238e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -273,7 +273,7 @@ bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) EXPORT_SYMBOL(cookie_timestamp_decode); bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, - const struct net *net) + const struct net *net, const struct dst_entry *dst) { bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; @@ -283,7 +283,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, if (net->ipv4.sysctl_tcp_ecn) return true; - return false; + return dst_feature(dst, RTAX_FEATURE_ECN); } EXPORT_SYMBOL(cookie_ecn_ok); @@ -387,7 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk)); + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); ret = get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4e4617e90417..196b4388116c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5876,20 +5876,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, - const struct sock *listen_sk) + const struct sock *listen_sk, + const struct dst_entry *dst) { const struct tcphdr *th = tcp_hdr(skb); const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn; + bool ect, need_ecn, ecn_ok; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); need_ecn = tcp_ca_needs_ecn(listen_sk); + ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); - if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + if (!ect && !need_ecn && ecn_ok) inet_rsk(req)->ecn_ok = 1; else if (ect && need_ecn) inet_rsk(req)->ecn_ok = 1; @@ -5954,13 +5956,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; - if (!want_cookie || tmp_opt.tstamp_ok) - tcp_ecn_create_request(req, skb, sk); - - if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { + if (!want_cookie && !isn) { /* VJ's idea. 
We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before @@ -6008,6 +6004,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; } + tcp_ecn_create_request(req, skb, sk, dst); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + if (!tmp_opt.tstamp_ok) + inet_rsk(req)->ecn_ok = 0; + } + tcp_rsk(req)->snt_isn = isn; tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3d453b94747..0b88158dd4a7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk); + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || - tcp_ca_needs_ecn(sk)) { + + if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 52cc8cb02c0c..7337fc7947e2 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -262,7 +262,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk)); + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); ret = get_cookie_sock(sk, skb, req, dst); out: -- cgit v1.2.3 From 05006e8c59050b2bab1bfe9cac631505d21122a3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 22:54:36 +0100 Subject: esp4: remove assignment in if condition Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 360b565918c4..d2bf02e0a678 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -392,8 +392,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0) goto out; - if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + err = skb_cow_data(skb, 0, &trailer); + if (err < 0) goto out; + nfrags = err; assoclen = sizeof(*esph); -- cgit v1.2.3 From 6cf1093e58f5c103840cf361721b6e346ec2275d Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 4 Nov 2014 23:11:19 +0100 Subject: udp: remove blank line between set and test Suggested-by: Joe Perches Signed-off-by: Fabian Frederick Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf0cece50b6c..3f001db72351 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1784,7 +1784,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); - if (sk != NULL) { int ret; -- cgit v1.2.3 From 63487babf08d6d67483c67ed21d8cea6674a44ec Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:51 -0800 Subject: net: Move fou_build_header into fou.c and refactor Move fou_build_header out of ip_tunnel.c and into fou.c splitting it up into fou_build_header, gue_build_header, and fou_build_udp. This allows for other users for TX of FOU or GUE. Change ip_tunnel_encap to call fou_build_header or gue_build_header based on the tunnel encapsulation type. Similarly, added fou_encap_hlen and gue_encap_hlen functions which are called by ip_encap_hlen. New net/fou.h has prototypes and defines for this. Added NET_FOU_IP_TUNNELS configuration. When this is set, IP tunnels can use FOU/GUE and fou module is also selected. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/fou.h | 26 +++++++++++++++++++ net/ipv4/Kconfig | 9 +++++++ net/ipv4/fou.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_tunnel.c | 61 +++++++++---------------------------------- 4 files changed, 120 insertions(+), 49 deletions(-) create mode 100644 include/net/fou.h (limited to 'net/ipv4') diff --git a/include/net/fou.h b/include/net/fou.h new file mode 100644 index 000000000000..cf4ce8874f92 --- /dev/null +++ b/include/net/fou.h @@ -0,0 +1,26 @@ +#ifndef __NET_FOU_H +#define __NET_FOU_H + +#include + +#include +#include +#include +#include + +int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4); +int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4); + +static size_t fou_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct udphdr); +} + +static size_t gue_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct udphdr) + sizeof(struct guehdr); +} + +#endif diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e682b48e0709..bd2901604842 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -322,6 +322,15 @@ config NET_FOU network mechanisms and optimizations for UDP (such as ECMP and RSS) can be leveraged to provide better service. +config NET_FOU_IP_TUNNELS + bool "IP: FOU encapsulation of IP tunnels" + depends on NET_IPIP || NET_IPGRE || IPV6_SIT + select NET_FOU + ---help--- + Allow configuration of FOU or GUE encapsulation for IP tunnels. + When this option is enabled IP tunnels can be configured to use + FOU or GUE encapsulation. 
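For reference, a rough standalone sketch of the encapsulation overhead that fou_encap_hlen() and gue_encap_hlen() report after this refactor; the enum, the size constants and encap_hlen() here are simplified stand-ins, not the kernel's types:

#include <stddef.h>
#include <stdio.h>

enum encap_type { ENCAP_NONE, ENCAP_FOU, ENCAP_GUE };

#define UDP_HLEN 8u /* sizeof(struct udphdr) */
#define GUE_HLEN 4u /* sizeof(struct guehdr) */

static size_t encap_hlen(enum encap_type t)
{
    switch (t) {
    case ENCAP_FOU:
        return UDP_HLEN;            /* fou_encap_hlen() */
    case ENCAP_GUE:
        return UDP_HLEN + GUE_HLEN; /* gue_encap_hlen() */
    case ENCAP_NONE:
    default:
        return 0;
    }
}

int main(void)
{
    printf("FOU adds %zu bytes, GUE adds %zu bytes of encapsulation\n",
           encap_hlen(ENCAP_FOU), encap_hlen(ENCAP_GUE));
    return 0;
}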
+ config GENEVE tristate "Generic Network Virtualization Encapsulation (Geneve)" depends on INET diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 32e78924e246..5446c1c8c26c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -487,6 +487,79 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi4 *fl4, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + __be16 sport; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(fou_build_header); + +int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + struct guehdr *guehdr; + size_t hdr_len = sizeof(struct guehdr); + __be16 sport; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get source port (based on flow hash) before skb_push */ + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + skb_push(skb, hdr_len); + + guehdr = (struct guehdr *)skb->data; + + guehdr->version = 0; + guehdr->hlen = 0; + guehdr->flags = 0; + guehdr->next_hdr = *protocol; + + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(gue_build_header); + static int __init fou_init(void) { int ret; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0bb8e141eacc..c3587e1c8b82 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,7 +56,10 @@ #include #include #include -#include + +#if IS_ENABLED(CONFIG_NET_FOU) +#include +#endif #if IS_ENABLED(CONFIG_IPV6) #include @@ -494,10 +497,12 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e) switch (e->type) { case TUNNEL_ENCAP_NONE: return 0; +#if IS_ENABLED(CONFIG_NET_FOU) case TUNNEL_ENCAP_FOU: - return sizeof(struct udphdr); + return fou_encap_hlen(e); case TUNNEL_ENCAP_GUE: - return sizeof(struct udphdr) + sizeof(struct guehdr); + return gue_encap_hlen(e); +#endif default: return -EINVAL; } @@ -526,60 +531,18 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t, } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); -static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - size_t hdr_len, u8 *protocol, struct flowi4 *fl4) -{ - struct udphdr *uh; - __be16 sport; - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - - skb = iptunnel_handle_offloads(skb, csum, type); - - if (IS_ERR(skb)) - return PTR_ERR(skb); - - /* Get length and hash before making space in skb */ - - sport = e->sport ? 
: udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - if (e->type == TUNNEL_ENCAP_GUE) { - struct guehdr *guehdr = (struct guehdr *)&uh[1]; - - guehdr->version = 0; - guehdr->hlen = 0; - guehdr->flags = 0; - guehdr->next_hdr = *protocol; - } - - uh->dest = e->dport; - uh->source = sport; - uh->len = htons(skb->len); - uh->check = 0; - udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, - fl4->saddr, fl4->daddr, skb->len); - - *protocol = IPPROTO_UDP; - - return 0; -} - int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, u8 *protocol, struct flowi4 *fl4) { switch (t->encap.type) { case TUNNEL_ENCAP_NONE: return 0; +#if IS_ENABLED(CONFIG_NET_FOU) case TUNNEL_ENCAP_FOU: + return fou_build_header(skb, &t->encap, protocol, fl4); case TUNNEL_ENCAP_GUE: - return fou_build_header(skb, &t->encap, t->encap_hlen, - protocol, fl4); + return gue_build_header(skb, &t->encap, protocol, fl4); +#endif default: return -EINVAL; } -- cgit v1.2.3 From 4bcb877d257c87298aedead1ffeaba0d5df1991d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:52 -0800 Subject: udp: Offload outer UDP tunnel csum if available In __skb_udp_tunnel_segment if outer UDP checksums are enabled and ip_summed is not already CHECKSUM_PARTIAL, set up checksum offload if device features allow it. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 52 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6480cea7aa53..a774711a88b9 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -29,7 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, netdev_features_t features), - __be16 new_protocol) + __be16 new_protocol, bool is_ipv6) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; @@ -39,7 +39,9 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t enc_features; int udp_offset, outer_hlen; unsigned int oldlen; - bool need_csum; + bool need_csum = !!(skb_shinfo(skb)->gso_type & + SKB_GSO_UDP_TUNNEL_CSUM); + bool offload_csum = false, dont_encap = need_csum; oldlen = (u16)~skb->len; @@ -52,10 +54,12 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + skb->encap_hdr_csum = need_csum; - need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - if (need_csum) - skb->encap_hdr_csum = 1; + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + (skb->dev->features & + (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM))); /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & features; @@ -72,11 +76,21 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, do { struct udphdr *uh; int len; - - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + __be32 delta; + + if (dont_encap) { + skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; + } else { + /* Only set up inner headers if we might be offloading + * inner checksum. 
+ */ + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } skb->mac_len = mac_len; + skb->protocol = protocol; skb_push(skb, outer_hlen); skb_reset_mac_header(skb); @@ -86,19 +100,25 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, uh = udp_hdr(skb); uh->len = htons(len); - if (need_csum) { - __be32 delta = htonl(oldlen + len); + if (!need_csum) + continue; + + delta = htonl(oldlen + len); + + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); + if (offload_csum) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + } else { uh->check = gso_make_checksum(skb, ~uh->check); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } - - skb->protocol = protocol; } while ((skb = skb->next)); out: return segs; @@ -134,7 +154,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, } segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, - protocol); + protocol, is_ipv6); out_unlock: rcu_read_unlock(); -- cgit v1.2.3 From 5024c33ac354577635c5671498891eb197f3ec4d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:53 -0800 Subject: gue: Add infrastructure for flags and options Add functions and basic definitions for processing standard flags, private flags, and control messages. This includes definitions to compute length of optional fields corresponding to a set of flags. Flag validation is in validate_gue_flags function. This checks for unknown flags, and that length of optional fields is <= length in guehdr hlen. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/gue.h | 100 ++++++++++++++++++++++++++++++++++++-- net/ipv4/fou.c | 142 ++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 189 insertions(+), 53 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/gue.h b/include/net/gue.h index b6c332788084..cb68ae843c77 100644 --- a/include/net/gue.h +++ b/include/net/gue.h @@ -1,23 +1,113 @@ #ifndef __NET_GUE_H #define __NET_GUE_H +/* Definitions for the GUE header, standard and private flags, lengths + * of optional fields are below. + * + * Diagram of GUE header: + * + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |Ver|C| Hlen | Proto/ctype | Standard flags |P| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * ~ Fields (optional) ~ + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Private flags (optional, P bit is set) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * ~ Private fields (optional) ~ + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * C bit indicates contol message when set, data message when unset. + * For a control message, proto/ctype is interpreted as a type of + * control message. For data messages, proto/ctype is the IP protocol + * of the next header. + * + * P bit indicates private flags field is present. The private flags + * may refer to options placed after this field. 
+ */ + struct guehdr { union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 hlen:4, - version:4; + __u8 hlen:5, + control:1, + version:2; #elif defined (__BIG_ENDIAN_BITFIELD) - __u8 version:4, - hlen:4; + __u8 version:2, + control:1, + hlen:5; #else #error "Please fix " #endif - __u8 next_hdr; + __u8 proto_ctype; __u16 flags; }; __u32 word; }; }; +/* Standard flags in GUE header */ + +#define GUE_FLAG_PRIV htons(1<<0) /* Private flags are in options */ +#define GUE_LEN_PRIV 4 + +#define GUE_FLAGS_ALL (GUE_FLAG_PRIV) + +/* Private flags in the private option extension */ + +#define GUE_PFLAGS_ALL (0) + +/* Functions to compute options length corresponding to flags. + * If we ever have a lot of flags this can be potentially be + * converted to a more optimized algorithm (table lookup + * for instance). + */ +static inline size_t guehdr_flags_len(__be16 flags) +{ + return ((flags & GUE_FLAG_PRIV) ? GUE_LEN_PRIV : 0); +} + +static inline size_t guehdr_priv_flags_len(__be32 flags) +{ + return 0; +} + +/* Validate standard and private flags. Returns non-zero (meaning invalid) + * if there is an unknown standard or private flags, or the options length for + * the flags exceeds the options length specific in hlen of the GUE header. + */ +static inline int validate_gue_flags(struct guehdr *guehdr, + size_t optlen) +{ + size_t len; + __be32 flags = guehdr->flags; + + if (flags & ~GUE_FLAGS_ALL) + return 1; + + len = guehdr_flags_len(flags); + if (len > optlen) + return 1; + + if (flags & GUE_FLAG_PRIV) { + /* Private flags are last four bytes accounted in + * guehdr_flags_len + */ + flags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV); + + if (flags & ~GUE_PFLAGS_ALL) + return 1; + + len += guehdr_priv_flags_len(flags); + if (len > optlen) + return 1; + } + + return 0; +} + #endif diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 5446c1c8c26c..a3b8c5b36303 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -38,21 +38,17 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static int fou_udp_encap_recv_deliver(struct sk_buff *skb, - u8 protocol, size_t len) +static void fou_recv_pull(struct sk_buff *skb, size_t len) { struct iphdr *iph = ip_hdr(skb); /* Remove 'len' bytes from the packet (UDP header and - * FOU header if present), modify the protocol to the one - * we found, and then call rcv_encap. + * FOU header if present). 
*/ iph->tot_len = htons(ntohs(iph->tot_len) - len); __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); - - return -protocol; } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) @@ -62,16 +58,24 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - return fou_udp_encap_recv_deliver(skb, fou->protocol, - sizeof(struct udphdr)); + fou_recv_pull(skb, sizeof(struct udphdr)); + + return -fou->protocol; +} + +static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) +{ + /* No support yet */ + kfree_skb(skb); + return 0; } static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); - size_t len; + size_t len, optlen, hdrlen; struct guehdr *guehdr; - struct udphdr *uh; + void *data; if (!fou) return 1; @@ -80,25 +84,38 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, len)) goto drop; - uh = udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + optlen = guehdr->hlen << 2; + len += optlen; - len += guehdr->hlen << 2; if (!pskb_may_pull(skb, len)) goto drop; - uh = udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + /* guehdr may change after pull */ + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - if (guehdr->version != 0) - goto drop; + hdrlen = sizeof(struct guehdr) + optlen; - if (guehdr->flags) { - /* No support yet */ + if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto drop; + + /* Pull UDP and GUE headers */ + fou_recv_pull(skb, len); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + data += GUE_LEN_PRIV; + + /* Process private flags */ } - return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); + if (unlikely(guehdr->control)) + return gue_control_message(skb, guehdr); + + return -guehdr->proto_ctype; + drop: kfree_skb(skb); return 0; @@ -154,36 +171,47 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, const struct net_offload *ops; struct sk_buff **pp = NULL; struct sk_buff *p; - u8 proto; struct guehdr *guehdr; - unsigned int hlen, guehlen; - unsigned int off; + size_t len, optlen, hdrlen, off; + void *data; int flush = 1; off = skb_gro_offset(skb); - hlen = off + sizeof(*guehdr); + len = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; } - proto = guehdr->next_hdr; + optlen = guehdr->hlen << 2; + len += optlen; - rcu_read_lock(); - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); - if (WARN_ON(!ops || !ops->callbacks.gro_receive)) - goto out_unlock; + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); + if (unlikely(!guehdr)) + goto out; + } - guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + if (unlikely(guehdr->control) || guehdr->version != 0 || + validate_gue_flags(guehdr, optlen)) + goto out; - hlen = off + guehlen; - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!guehdr)) - goto out_unlock; + hdrlen = sizeof(*guehdr) + optlen; + + skb_gro_pull(skb, hdrlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, guehdr, hdrlen); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + data += GUE_LEN_PRIV; + + /* Process private flags */ } flush = 0; @@ -197,7 +225,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, guehdr2 = (struct guehdr *)(p->data + off); /* Compare base GUE header to be equal (covers - * hlen, version, next_hdr, and flags. + * hlen, version, proto_ctype, and flags. */ if (guehdr->word != guehdr2->word) { NAPI_GRO_CB(p)->same_flow = 0; @@ -212,10 +240,11 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, } } - skb_gro_pull(skb, guehlen); - - /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ - skb_gro_postpull_rcsum(skb, guehdr, guehlen); + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[guehdr->proto_ctype]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); @@ -236,7 +265,7 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff) u8 proto; int err = -ENOENT; - proto = guehdr->next_hdr; + proto = guehdr->proto_ctype; guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); @@ -533,8 +562,12 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; - size_t hdr_len = sizeof(struct guehdr); + size_t optlen = 0; __be16 sport; + void *data; + bool need_priv = false; + + optlen += need_priv ? GUE_LEN_PRIV : 0; skb = iptunnel_handle_offloads(skb, csum, type); @@ -545,14 +578,27 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); - skb_push(skb, hdr_len); + skb_push(skb, sizeof(struct guehdr) + optlen); guehdr = (struct guehdr *)skb->data; + guehdr->control = 0; guehdr->version = 0; - guehdr->hlen = 0; + guehdr->hlen = optlen >> 2; guehdr->flags = 0; - guehdr->next_hdr = *protocol; + guehdr->proto_ctype = *protocol; + + data = &guehdr[1]; + + if (need_priv) { + __be32 *flags = data; + + guehdr->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + /* Add private flags */ + } fou_build_udp(skb, e, fl4, protocol, sport); -- cgit v1.2.3 From e585f23636370320bc2071ca5ba2744ae37c3e51 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:54 -0800 Subject: udp: Changes to udp_offload to support remote checksum offload Add a new GSO type, SKB_GSO_TUNNEL_REMCSUM, which indicates remote checksum offload being done (in this case inner checksum must not be offloaded to the NIC). Added logic in __skb_udp_tunnel_segment to handle remote checksum offload case. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/netdev_features.h | 4 +++- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 4 +++- net/core/skbuff.c | 4 ++-- net/ipv4/af_inet.c | 1 + net/ipv4/tcp_offload.c | 1 + net/ipv4/udp_offload.c | 18 ++++++++++++++++-- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 1 + 9 files changed, 29 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index dcfdecbfa0b7..8c94b07e654a 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -48,8 +48,9 @@ enum { NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ + NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_MPLS_BIT, + NETIF_F_GSO_TUNNEL_REMCSUM_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CSUM_BIT, /* SCTP checksum offload */ @@ -119,6 +120,7 @@ enum { #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) #define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) +#define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ed05bd764dc..4767f546d7c0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3584,6 +3584,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_MPLS != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ad9675b6fe1..74ed34413969 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -373,6 +373,7 @@ enum { SKB_GSO_MPLS = 1 << 12, + SKB_GSO_TUNNEL_REMCSUM = 1 << 13, }; #if BITS_PER_LONG > 32 @@ -603,7 +604,8 @@ struct sk_buff { #endif __u8 ipvs_property:1; __u8 inner_protocol_type:1; - /* 4 or 6 bit hole */ + __u8 remcsum_offload:1; + /* 3 or 5 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e48e5c02e877..700189604f3d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3013,7 +3013,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg) { + if (!sg && !nskb->remcsum_offload) { nskb->ip_summed = CHECKSUM_NONE; nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), @@ -3085,7 +3085,7 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum) { + if (!csum && !nskb->remcsum_offload) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b7fe5b03906..ed2c672c5b01 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1222,6 +1222,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, 
SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_MPLS | 0))) goto out; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5b90f2f447a5..a1b2a5624f91 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -97,6 +97,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a774711a88b9..0a5a70d0e84c 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -41,7 +41,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, unsigned int oldlen; bool need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - bool offload_csum = false, dont_encap = need_csum; + bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + bool offload_csum = false, dont_encap = (need_csum || remcsum); oldlen = (u16)~skb->len; @@ -55,6 +56,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; skb->encap_hdr_csum = need_csum; + skb->remcsum_offload = remcsum; /* Try to offload checksum if possible */ offload_csum = !!(need_csum && @@ -108,11 +110,22 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, uh->check = ~csum_fold((__force __wsum) ((__force u32)uh->check + (__force u32)delta)); - if (offload_csum) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); + } else if (remcsum) { + /* Need to calculate checksum from scratch, + * inner checksums are never when doing + * remote_checksum_offload. + */ + + skb->csum = skb_checksum(skb, udp_offset, + skb->len - udp_offset, + 0); + uh->check = csum_fold(skb->csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; } else { uh->check = gso_make_checksum(skb, ~uh->check); @@ -192,6 +205,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_IPIP | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | SKB_GSO_MPLS) || diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index a071563a7e6e..e9767079a360 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -78,6 +78,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_SIT | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_MPLS | SKB_GSO_TCPV6 | 0))) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 6b8f543f6ac6..637ba2e438b7 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,6 +42,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | -- cgit v1.2.3 From b17f709a24013fcbb257f6f89b4d81ac9fdf0d18 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:56 -0800 Subject: gue: TX support for using remote checksum offload option Add if_tunnel flag TUNNEL_ENCAP_FLAG_REMCSUM to configure remote checksum offload on an IP tunnel. Add logic in gue_build_header to insert remote checksum offload option. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/fou.h | 14 +++++++++++++- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/fou.c | 35 ++++++++++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/fou.h b/include/net/fou.h index cf4ce8874f92..25b26ffcf1df 100644 --- a/include/net/fou.h +++ b/include/net/fou.h @@ -20,7 +20,19 @@ static size_t fou_encap_hlen(struct ip_tunnel_encap *e) static size_t gue_encap_hlen(struct ip_tunnel_encap *e) { - return sizeof(struct udphdr) + sizeof(struct guehdr); + size_t len; + bool need_priv = false; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + + if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { + len += GUE_PLEN_REMCSUM; + need_priv = true; + } + + len += need_priv ? GUE_LEN_PRIV : 0; + + return len; } #endif diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 280d9e092283..bd3cc11a431f 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -69,6 +69,7 @@ enum tunnel_encap_types { #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) #define TUNNEL_ENCAP_FLAG_CSUM6 (1<<1) +#define TUNNEL_ENCAP_FLAG_REMCSUM (1<<2) /* SIT-mode i_flags */ #define SIT_ISATAP 0x0001 diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a3b8c5b36303..fb0db99adf9e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -562,11 +562,19 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; - size_t optlen = 0; + size_t hdrlen, optlen = 0; __be16 sport; void *data; bool need_priv = false; + if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + csum = false; + optlen += GUE_PLEN_REMCSUM; + type |= SKB_GSO_TUNNEL_REMCSUM; + need_priv = true; + } + optlen += need_priv ? GUE_LEN_PRIV : 0; skb = iptunnel_handle_offloads(skb, csum, type); @@ -578,7 +586,9 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); - skb_push(skb, sizeof(struct guehdr) + optlen); + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); guehdr = (struct guehdr *)skb->data; @@ -597,7 +607,26 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, *flags = 0; data += GUE_LEN_PRIV; - /* Add private flags */ + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd = data; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + } fou_build_udp(skb, e, fl4, protocol, sport); -- cgit v1.2.3 From a8d31c128bf574bed2fa29e0512b24d446018a50 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:57 -0800 Subject: gue: Receive side of remote checksum offload Add processing of the remote checksum offload option in both the normal path as well as the GRO path. The implements patching the affected checksum to derive the offloaded checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/ipv4/fou.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 161 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index fb0db99adf9e..740ae099a0d9 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -63,6 +63,59 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) return -fou->protocol; } +static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, + void *data, int hdrlen, u8 ipproto) +{ + __be16 *pd = data; + u16 start = ntohs(pd[0]); + u16 offset = ntohs(pd[1]); + u16 poffset = 0; + u16 plen; + __wsum csum, delta; + __sum16 *psum; + + if (skb->remcsum_offload) { + /* Already processed in GRO path */ + skb->remcsum_offload = 0; + return guehdr; + } + + if (start > skb->len - hdrlen || + offset > skb->len - hdrlen - sizeof(u16)) + return NULL; + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) + __skb_checksum_complete(skb); + + plen = hdrlen + offset + sizeof(u16); + if (!pskb_may_pull(skb, plen)) + return NULL; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + if (ipproto == IPPROTO_IP && sizeof(struct iphdr) < plen) { + struct iphdr *ip = (struct iphdr *)(skb->data + hdrlen); + + /* If next header happens to be IP we can skip that for the + * checksum calculation since the IP header checksum is zero + * if correct. + */ + poffset = ip->ihl * 4; + } + + csum = csum_sub(skb->csum, skb_checksum(skb, poffset + hdrlen, + start - poffset - hdrlen, 0)); + + /* Set derived checksum in packet */ + psum = (__sum16 *)(skb->data + hdrlen + offset); + delta = csum_sub(csum_fold(csum), *psum); + *psum = csum_fold(csum); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + + return guehdr; +} + static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) { /* No support yet */ @@ -76,6 +129,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) size_t len, optlen, hdrlen; struct guehdr *guehdr; void *data; + u16 doffset = 0; if (!fou) return 1; @@ -100,20 +154,43 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto drop; - /* Pull UDP and GUE headers */ - fou_recv_pull(skb, len); + hdrlen = sizeof(struct guehdr) + optlen; + + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + + /* Pull UDP header now, skb->data points to guehdr */ + __skb_pull(skb, sizeof(struct udphdr)); + + /* Pull csum through the guehdr now . This can be used if + * there is a remote checksum offload. 
+ */ + skb_postpull_rcsum(skb, udp_hdr(skb), len); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { - data += GUE_LEN_PRIV; + __be32 flags = *(__be32 *)(data + doffset); + + doffset += GUE_LEN_PRIV; - /* Process private flags */ + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_remcsum(skb, guehdr, data + doffset, + hdrlen, guehdr->proto_ctype); + if (!guehdr) + goto drop; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); + __skb_pull(skb, hdrlen); + skb_reset_transport_header(skb); + return -guehdr->proto_ctype; drop: @@ -164,6 +241,66 @@ out_unlock: return err; } +static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, + struct guehdr *guehdr, void *data, + size_t hdrlen, u8 ipproto) +{ + __be16 *pd = data; + u16 start = ntohs(pd[0]); + u16 offset = ntohs(pd[1]); + u16 poffset = 0; + u16 plen; + void *ptr; + __wsum csum, delta; + __sum16 *psum; + + if (skb->remcsum_offload) + return guehdr; + + if (start > skb_gro_len(skb) - hdrlen || + offset > skb_gro_len(skb) - hdrlen - sizeof(u16) || + !NAPI_GRO_CB(skb)->csum_valid || skb->remcsum_offload) + return NULL; + + plen = hdrlen + offset + sizeof(u16); + + /* Pull checksum that will be written */ + if (skb_gro_header_hard(skb, off + plen)) { + guehdr = skb_gro_header_slow(skb, off + plen, off); + if (!guehdr) + return NULL; + } + + ptr = (void *)guehdr + hdrlen; + + if (ipproto == IPPROTO_IP && + (hdrlen + sizeof(struct iphdr) < plen)) { + struct iphdr *ip = (struct iphdr *)(ptr + hdrlen); + + /* If next header happens to be IP we can skip + * that for the checksum calculation since the + * IP header checksum is zero if correct. + */ + poffset = ip->ihl * 4; + } + + csum = csum_sub(NAPI_GRO_CB(skb)->csum, + csum_partial(ptr + poffset, start - poffset, 0)); + + /* Set derived checksum in packet */ + psum = (__sum16 *)(ptr + offset); + delta = csum_sub(csum_fold(csum), *psum); + *psum = csum_fold(csum); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + skb->remcsum_offload = 1; + + return guehdr; +} + static struct sk_buff **gue_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -174,6 +311,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, struct guehdr *guehdr; size_t len, optlen, hdrlen, off; void *data; + u16 doffset = 0; int flush = 1; off = skb_gro_offset(skb); @@ -201,19 +339,33 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, hdrlen = sizeof(*guehdr) + optlen; - skb_gro_pull(skb, hdrlen); - - /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr, + * this is needed if there is a remote checkcsum offload. + */ skb_gro_postpull_rcsum(skb, guehdr, hdrlen); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { - data += GUE_LEN_PRIV; + __be32 flags = *(__be32 *)(data + doffset); - /* Process private flags */ + doffset += GUE_LEN_PRIV; + + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_gro_remcsum(skb, off, guehdr, + data + doffset, hdrlen, + guehdr->proto_ctype); + if (!guehdr) + goto out; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } + skb_gro_pull(skb, hdrlen); + flush = 0; for (p = *head; p; p = p->next) { -- cgit v1.2.3 From 51f3d02b980a338cd291d2bc7629cdfb2568424b Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Wed, 5 Nov 2014 16:46:40 -0500 Subject: net: Add and use skb_copy_datagram_msg() helper. This encapsulates all of the skb_copy_datagram_iovec() callers with call argument signature "skb, offset, msghdr->msg_iov, length". When we move to iov_iters in the networking, the iov_iter object will sit in the msghdr. Having a helper like this means there will be less places to touch during that transformation. Based upon descriptions and patch from Al Viro. Signed-off-by: David S. Miller --- drivers/isdn/mISDN/socket.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- include/linux/skbuff.h | 6 ++++++ net/appletalk/ddp.c | 2 +- net/atm/common.c | 2 +- net/ax25/af_ax25.c | 2 +- net/bluetooth/af_bluetooth.c | 4 ++-- net/bluetooth/hci_sock.c | 2 +- net/caif/caif_socket.c | 2 +- net/core/sock.c | 2 +- net/dccp/proto.c | 2 +- net/ieee802154/dgram.c | 2 +- net/ieee802154/raw.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp.c | 5 ++--- net/ipv4/udp.c | 4 ++-- net/ipv6/datagram.c | 4 ++-- net/ipv6/raw.c | 4 ++-- net/ipv6/udp.c | 4 ++-- net/ipx/af_ipx.c | 3 +-- net/irda/af_irda.c | 2 +- net/iucv/af_iucv.c | 2 +- net/key/af_key.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/l2tp/l2tp_ppp.c | 2 +- net/llc/af_llc.c | 3 +-- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 2 +- net/nfc/llcp_sock.c | 2 +- net/nfc/rawsock.c | 2 +- net/packet/af_packet.c | 2 +- net/phonet/datagram.c | 2 +- net/phonet/pep.c | 2 +- net/rose/af_rose.c | 2 +- net/rxrpc/ar-recvmsg.c | 2 +- net/sctp/socket.c | 2 +- net/tipc/socket.c | 7 +++---- net/unix/af_unix.c | 6 +++--- net/vmw_vsock/vmci_transport.c | 3 +-- net/x25/af_x25.c | 2 +- 43 files changed, 58 insertions(+), 57 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 1be82284cf9d..dcbd8589f0c4 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock, memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 6c9c16d76935..443cbbf5c55f 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb) { total_len = min_t(size_t, total_len, skb->len); - error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len); + error = skb_copy_datagram_msg(skb, 0, m, total_len); if (error == 0) { consume_skb(skb); return total_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 74ed34413969..39ec7530ae27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -2639,6 +2640,11 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, struct iovec *to, int size); +static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, + struct msghdr *msg, int size) +{ + return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size); +} int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, diff --git 
a/net/appletalk/ddp.c b/net/appletalk/ddp.c index c00897f65a31..425942db17f6 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1758,7 +1758,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr copied = size; msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, offset, msg, copied); if (!err && msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name); diff --git a/net/atm/common.c b/net/atm/common.c index 6a765156a3f6..9cd1ccae9a11 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, msg->msg_flags |= MSG_TRUNC; } - error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + error = skb_copy_datagram_msg(skb, 0, msg, copied); if (error) return error; sock_recv_ts_and_drops(msg, sk, skb); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c35c3f48fc0f..f4f835e19378 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { ax25_digi digi; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 339c74ad4553..0a7cc565f93e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err == 0) { sock_recv_ts_and_drops(msg, sk, skb); @@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } chunk = min_t(unsigned int, skb->len, size); - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, 0, msg, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (!copied) copied = -EFAULT; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 115f149362ba..29e1ec7189bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 43f750e88e19..fbcd156099fb 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, copylen = len; } - ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen); + ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; diff --git a/net/core/sock.c b/net/core/sock.c index 15e0c67b1069..ac56dd06c306 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5ab6627cf370..8e6ae9422a7b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -896,7 
+896,7 @@ verify_sock_status: else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index ef2ad8aaef13..fc9193eabd41 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, } /* FIXME: skip headers if necessary ?! */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 9d1f64806f02..73a4d53463de 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c373a9ad4555..21894df66262 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 57f7c9804139..736236c3e554 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } /* Don't bother checking the checksum */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db3100c23..ee8fa4bf3b73 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39ec0c379545..c239f4740d10 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1833,8 +1833,7 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! 
*/ if (!copied) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3f001db72351..df19027f44f3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1281,8 +1281,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2cdc38338be3..5c6996e44b14 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; @@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 075a0fb400e7..0cbcf98f2cab 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, } if (skb_csum_unnecessary(skb)) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); if (err == -EINVAL) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6ba535b6feb..9b6809232b17 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -424,8 +424,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 313ef4644069..a0c75366c93b 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1805,8 +1805,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, - copied); + rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied); if (rc) goto out_free; if (skb->tstamp.tv64) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 92fafd485deb..980bc2670a13 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1396,7 +1396,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, copied = size; msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index a089b6b91650..057b5647ef92 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1355,7 +1355,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN; cskb = skb; - if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, offset, msg, 
copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/key/af_key.c b/net/key/af_key.c index 1847ec4e3930..e5883091a8c6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 369a9822488c..a6cc1fed2b52 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 0edb263cc002..2177b960da87 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b704a9356208..c559bcdf4679 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock, else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (likely(err == 0)) err = len; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index bb9cbc17d926..af662669f951 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -819,8 +819,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, used = len; if (!(flags & MSG_TRUNC)) { - int rc = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! 
*/ if (!copied) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1de72de273e..580b79452bec 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } skb_reset_transport_header(data_skb); - err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1b06a1fcf3e8..7e13f6afcd1f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_msg(skb, 0, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 51f077a92fa9..83bc785d5855 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = min_t(unsigned int, rlen, len); cskb = skb; - if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 11c3544ea546..9d7d2b7ba5e4 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = len; } - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 87d20f48ff06..4cd13d8de44b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 290352c0e6b4..0918bc21eae6 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, copylen = len; } - rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen); + rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 70a547ea5177..44b2123e22b8 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1296,7 +1296,7 @@ copy: else len = skb->len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (!err) err = (flags & MSG_TRUNC) ? 
skb->len : len; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index a85c1a086ae4..9b600c20a7a3 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { struct sockaddr_rose *srose; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index e9aaa65c0778..4575485ad1b4 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,7 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); + ret = skb_copy_datagram_msg(skb, offset, msg, copy); if (ret < 0) goto copy_error; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 634a2abb5f3a..2120292c842d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, if (copied > len) copied = len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); event = sctp_skb2event(skb); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ad8a1a1e2275..591bbfa082a0 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1372,8 +1372,7 @@ restart: sz = buf_len; m->msg_flags |= MSG_TRUNC; } - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg), - m->msg_iov, sz); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz); if (res) goto exit; res = sz; @@ -1473,8 +1472,8 @@ restart: needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? sz : needed; - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset, - m->msg_iov, sz_to_copy); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset, + m, sz_to_copy); if (res) goto exit; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e96884380732..5eee625d113f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1825,7 +1825,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size); + err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; @@ -2030,8 +2030,8 @@ again: } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, - msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + msg, chunk)) { if (copied == 0) copied = -EFAULT; break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 9bb63ffec4f2..a57ddef7d5af 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1773,8 +1773,7 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, } /* Place the datagram payload in the user's iovec. 
*/ - err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov, - payload_len); + err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len); if (err) goto out; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5ad4418ef093..59e785bfde65 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; -- cgit v1.2.3 From 1744bea1fa382f67263fdd9fee51d603fddb3da6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 4 Nov 2014 15:37:03 -0800 Subject: net: Convert SEQ_START_TOKEN/seq_printf to seq_puts Using a single fixed string is smaller code size than using a format and many string arguments. Reduces overall code size a little. $ size net/ipv4/igmp.o* net/ipv6/mcast.o* net/ipv6/ip6_flowlabel.o* text data bss dec hex filename 34269 7012 14824 56105 db29 net/ipv4/igmp.o.new 34315 7012 14824 56151 db57 net/ipv4/igmp.o.old 30078 7869 13200 51147 c7cb net/ipv6/mcast.o.new 30105 7869 13200 51174 c7e6 net/ipv6/mcast.o.old 11434 3748 8580 23762 5cd2 net/ipv6/ip6_flowlabel.o.new 11491 3748 8580 23819 5d0b net/ipv6/ip6_flowlabel.o.old Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 6 +----- net/ipv6/ip6_flowlabel.c | 3 +-- net/ipv6/mcast.c | 6 +----- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 3f8051358202..1e4adae2bde0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2687,11 +2687,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v) struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "%3s %6s " - "%10s %10s %6s %6s\n", "Idx", - "Device", "MCA", - "SRC", "INC", "EXC"); + seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index c14343715049..7221021b2d97 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -770,8 +770,7 @@ static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n", - "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt"); + seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); } else { struct ip6_flowlabel *fl = v; seq_printf(seq, diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9648de2b6745..e04f184b783a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2823,11 +2823,7 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "%3s %6s " - "%32s %32s %6s %6s\n", "Idx", - "Device", "Multicast Address", - "Source Address", "INC", "EXC"); + seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", -- cgit v1.2.3 From 4c672e4b42bc8046d63a6eb0a2c6a450a501af32 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Nov 2014 20:27:38 +0100 Subject: ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs It has been reported that generating an MLD listener report on devices with large 
MTUs (e.g. 9000) and a high number of IPv6 addresses can trigger a skb_over_panic(): skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20 head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0 dev:port1 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:100! invalid opcode: 0000 [#1] SMP Modules linked in: ixgbe(O) CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4 [...] Call Trace: [] ? skb_put+0x3a/0x3b [] ? add_grhead+0x45/0x8e [] ? add_grec+0x394/0x3d4 [] ? mld_ifc_timer_expire+0x195/0x20d [] ? mld_dad_timer_expire+0x45/0x45 [] ? call_timer_fn.isra.29+0x12/0x68 [] ? run_timer_softirq+0x163/0x182 [] ? __do_softirq+0xe0/0x21d [] ? irq_exit+0x4e/0xd3 [] ? smp_apic_timer_interrupt+0x3b/0x46 [] ? apic_timer_interrupt+0x6a/0x70 mld_newpack() skb allocations are usually requested with dev->mtu in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations") we have changed the limit in order to be less likely to fail. However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb) macros, which determine if we may end up doing an skb_put() for adding another record. To avoid possible fragmentation, we check the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong assumption as the actual max allocation size can be much smaller. The IGMP case doesn't have this issue as commit 57e1ab6eaddc ("igmp: refine skb allocations") stores the allocation size in the cb[]. Set a reserved_tailroom to make it fit into the MTU and use skb_availroom() helper instead. This also allows to get rid of igmp_skb_size(). Reported-by: Wei Liu Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations") Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Hannes Frederic Sowa Cc: David L Stevens Acked-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 11 +++++------ net/ipv6/mcast.c | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1e4adae2bde0..666cf364df86 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -318,9 +318,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } -#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb)) - -static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; @@ -330,6 +328,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu; while (1) { skb = alloc_skb(size + hlen + tlen, @@ -341,7 +340,6 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } skb->priority = TC_PRIO_CONTROL; - igmp_skb_size(skb) = size; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, @@ -354,6 +352,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_dst_set(skb, &rt->dst); skb->dev = dev; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); skb_reset_network_header(skb); @@ -423,8 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? 
skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index e04f184b783a..5ce107c8aab3 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1550,7 +1550,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, hdr->daddr = *daddr; } -static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) +static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { struct net_device *dev = idev->dev; struct net *net = dev_net(dev); @@ -1561,13 +1561,13 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) const struct in6_addr *saddr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu + hlen + tlen; int err; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - size += hlen + tlen; /* limit our allocations to order-0 page */ size = min_t(int, size, SKB_MAX_ORDER(0, 0)); skb = sock_alloc_send_skb(sk, size, 1, &err); @@ -1576,6 +1576,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) return NULL; skb->priority = TC_PRIO_CONTROL; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { @@ -1690,8 +1692,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) -- cgit v1.2.3 From e1b2cb655060e081e73b384b1fc8b2e978f73467 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 5 Nov 2014 16:49:38 -0800 Subject: fou: Fix typo in returning flags in netlink When filling netlink info, dport is being returned as flags. Fix instances to return correct value. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/ipv4/ip_gre.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/sit.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 12055fdbe716..ac8491245e5b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -789,7 +789,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, - t->encap.dport)) + t->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 37096d64730e..40403114f00a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -465,7 +465,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.dport)) + tunnel->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 58e5b4710127..45ad92442cd5 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1714,7 +1714,7 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.dport)) + tunnel->encap.flags)) goto nla_put_failure; return 0; -- cgit v1.2.3 From 59b93b41e7fa71138734a911b11b044340dd16bd Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 5 Nov 2014 15:27:48 -0800 Subject: net: Remove MPLS GSO feature. Device can export MPLS GSO support in dev->mpls_features same way it export vlan features in dev->vlan_features. So it is safe to remove NETIF_F_GSO_MPLS redundant flag. Signed-off-by: Pravin B Shelar --- include/linux/netdev_features.h | 5 +---- include/linux/netdevice.h | 1 - include/linux/skbuff.h | 4 +--- net/core/ethtool.c | 1 - net/ipv4/af_inet.c | 1 - net/ipv4/tcp_offload.c | 1 - net/ipv4/udp_offload.c | 3 +-- net/ipv6/ip6_offload.c | 1 - net/ipv6/udp_offload.c | 3 +-- net/mpls/mpls_gso.c | 3 +-- 10 files changed, 5 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 8c94b07e654a..8e30685affeb 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -47,7 +47,6 @@ enum { NETIF_F_GSO_SIT_BIT, /* ... SIT tunnel with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ - NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... 
TUNNEL with TSO & REMCSUM */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, @@ -119,7 +118,6 @@ enum { #define NETIF_F_GSO_SIT __NETIF_F(GSO_SIT) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) -#define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) @@ -183,7 +181,6 @@ enum { NETIF_F_GSO_IPIP | \ NETIF_F_GSO_SIT | \ NETIF_F_GSO_UDP_TUNNEL | \ - NETIF_F_GSO_UDP_TUNNEL_CSUM | \ - NETIF_F_GSO_MPLS) + NETIF_F_GSO_UDP_TUNNEL_CSUM) #endif /* _LINUX_NETDEV_FEATURES_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4767f546d7c0..90ac95900a11 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3583,7 +3583,6 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_MPLS != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39ec7530ae27..53f4f6c93356 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -372,9 +372,7 @@ enum { SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, - SKB_GSO_MPLS = 1 << 12, - - SKB_GSO_TUNNEL_REMCSUM = 1 << 13, + SKB_GSO_TUNNEL_REMCSUM = 1 << 12, }; #if BITS_PER_LONG > 32 diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 06dfb293e5aa..b0f84f5ddda8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -84,7 +84,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ed2c672c5b01..3a096bb2d596 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1223,7 +1223,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_MPLS | 0))) goto out; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index a1b2a5624f91..9d7930ba8e0f 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -94,7 +94,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | - SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 0a5a70d0e84c..d3e537ef6b7f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -207,8 +207,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/ip6_offload.c 
b/net/ipv6/ip6_offload.c index e9767079a360..fd76ce938c32 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -79,7 +79,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_MPLS | SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 637ba2e438b7..b6aa8ed18257 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -46,8 +46,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, SKB_GSO_GRE | SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_MPLS) || + SKB_GSO_SIT) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index e3545f21a099..ca27837974fe 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -34,8 +34,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_MPLS))) + SKB_GSO_IPIP))) goto out; /* Setup inner SKB. */ -- cgit v1.2.3 From cbffccc970394f82e397fccbeb136eeaffb3c552 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Nov 2014 14:39:21 -0800 Subject: net; ipv[46] - Remove 2 unnecessary NETDEBUG OOM messages These messages aren't useful as there's a generic dump_stack() on OOM. Neaten the comment and if test above the OOM by separating the assign in if into an allocation then if test. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 8 +++----- net/ipv6/ip6_output.c | 10 ++++------ 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bc6471d4abcd..4a929adf2ab7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -662,12 +662,10 @@ slow_path: if (len < left) { len &= ~7; } - /* - * Allocate buffer. - */ - if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); + /* Allocate buffer */ + skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); + if (!skb2) { err = -ENOMEM; goto fail; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8e950c250ada..916d2a166a9b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -747,13 +747,11 @@ slow_path: if (len < left) { len &= ~7; } - /* - * Allocate buffer. - */ - if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC)) == NULL) { - NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); + /* Allocate buffer */ + frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + + hroom + troom, GFP_ATOMIC); + if (!frag) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; -- cgit v1.2.3 From 450834977796cc74d1265d7dfe69cf6767537dfc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Nov 2014 15:36:08 -0800 Subject: net: esp: Convert NETDEBUG to pr_info Commit 64ce207306de ("[NET]: Make NETDEBUG pure printk wrappers") originally had these NETDEBUG printks as always emitting. Commit a2a316fd068c ("[NET]: Replace CONFIG_NET_DEBUG with sysctl") added a net_msg_warn sysctl to these NETDEBUG uses. Convert these NETDEBUG uses to normal pr_info calls. This changes the output prefix from "ESP: " to include "IPSec: " for the ipv4 case and "IPv6: " for the ipv6 case. These output lines are now like the other messages in the files. 
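The prefix behaviour described above can be shown with a small userspace sketch (an illustration only, not part of the patch; the two macros below merely stand in for the kernel's pr_fmt()/pr_info()):

  #include <stdio.h>

  /* esp4.c already carries a file-local "IPSec: " pr_fmt(); pr_info() expands
   * that prefix around the format string at compile time, which is why the
   * converted message now comes out as "IPSec: ESP: ..." (and the esp6.c one
   * as "IPv6: ESP: ..."). */
  #define pr_fmt(fmt) "IPSec: " fmt
  #define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

  int main(void)
  {
          /* mirrors the message converted in esp_init_authenc() */
          pr_info("ESP: %s digestsize %u != %hu\n",
                  "hmac(sha1)", 20U, (unsigned short)12);
          return 0;
  }
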
Other miscellanea: Neaten the arithmetic spacing to be consistent with other arithmetic spacing in the files. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 10 +++++----- net/ipv6/esp6.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d2bf02e0a678..60173d4d3a0e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -603,12 +603,12 @@ static int esp_init_authenc(struct xfrm_state *x) BUG_ON(!aalg_desc); err = -EINVAL; - if (aalg_desc->uinfo.auth.icv_fullbits/8 != + if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits/8); + pr_info("ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits / 8); goto free_key; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 83fc3a385a26..d21d7b22eebc 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -544,12 +544,12 @@ static int esp_init_authenc(struct xfrm_state *x) BUG_ON(!aalg_desc); err = -EINVAL; - if (aalg_desc->uinfo.auth.icv_fullbits/8 != + if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits/8); + pr_info("ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits / 8); goto free_key; } -- cgit v1.2.3 From 36cbb2452cbafca64dcdd3578047433787900cf0 Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Thu, 6 Nov 2014 10:37:54 -0800 Subject: udp: Increment UDP_MIB_IGNOREDMULTI for arriving unmatched multicasts As NIC multicast filtering isn't perfect, and some platforms are quite content to spew broadcasts, we should not trigger an event for skb:kfree_skb when we do not have a match for such an incoming datagram. We do though want to avoid sweeping the matter under the rug entirely, so increment a suitable statistic. This incorporates feedback from David L. Stevens, Karl Neiss and Eric Dumazet. V3 - use bool per David Miller Signed-off-by: Rick Jones Signed-off-by: David S. 
Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/udp.c | 12 +++++++++--- net/ipv6/proc.c | 1 + net/ipv6/udp.c | 11 ++++++++--- 5 files changed, 20 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index df40137f33dd..30f541b32895 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -156,6 +156,7 @@ enum UDP_MIB_RCVBUFERRORS, /* RcvbufErrors */ UDP_MIB_SNDBUFERRORS, /* SndbufErrors */ UDP_MIB_CSUMERRORS, /* InCsumErrors */ + UDP_MIB_IGNOREDMULTI, /* IgnoredMulti */ __UDP_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f0d4eb8b99b9..6513ade8d6dc 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -181,6 +181,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index df19027f44f3..5d0fdca8e965 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1647,7 +1647,8 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, - struct udp_table *udptable) + struct udp_table *udptable, + int proto) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; struct hlist_nulls_node *node; @@ -1656,6 +1657,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif = skb->dev->ifindex; unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + bool inner_flushed = false; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1674,6 +1676,7 @@ start_lookup: dif, hnum)) { if (unlikely(count == ARRAY_SIZE(stack))) { flush_stack(stack, count, skb, ~0); + inner_flushed = true; count = 0; } stack[count++] = sk; @@ -1695,7 +1698,10 @@ start_lookup: if (count) { flush_stack(stack, count, skb, count - 1); } else { - kfree_skb(skb); + if (!inner_flushed) + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); + consume_skb(skb); } return 0; } @@ -1781,7 +1787,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); + saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 1752cd0b4882..679253d0af84 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -136,6 +136,7 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9b6809232b17..b756355e9739 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -771,7 +771,7 @@ static void udp6_csum_zero_error(struct sk_buff *skb) */ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, - struct udp_table *udptable) + struct udp_table *udptable, int proto) { struct 
sock *sk, *stack[256 / sizeof(struct sock *)]; const struct udphdr *uh = udp_hdr(skb); @@ -781,6 +781,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif = inet6_iif(skb); unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + bool inner_flushed = false; if (use_hash2) { hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & @@ -803,6 +804,7 @@ start_lookup: (uh->check || udp_sk(sk)->no_check6_rx)) { if (unlikely(count == ARRAY_SIZE(stack))) { flush_stack(stack, count, skb, ~0); + inner_flushed = true; count = 0; } stack[count++] = sk; @@ -821,7 +823,10 @@ start_lookup: if (count) { flush_stack(stack, count, skb, count - 1); } else { - kfree_skb(skb); + if (!inner_flushed) + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); + consume_skb(skb); } return 0; } @@ -873,7 +878,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, */ if (ipv6_addr_is_multicast(daddr)) return __udp6_lib_mcast_deliver(net, skb, - saddr, daddr, udptable); + saddr, daddr, udptable, proto); /* Unicast */ -- cgit v1.2.3 From 32b5913a931fd753faf3d4e1124b2bc2edb364da Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 7 Nov 2014 21:27:08 +0800 Subject: ipv4: Use standard iovec primitive in raw_probe_proto_opt The function raw_probe_proto_opt tries to extract the first two bytes from the user input in order to seed the IPsec lookup for ICMP packets. In doing so it's processing iovec by hand and overcomplicating things. This patch replaces the manual iovec processing with a call to memcpy_fromiovecend. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/raw.c | 50 +++++++++++--------------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ee8fa4bf3b73..9be905057b24 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -422,48 +422,20 @@ error: static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { - struct iovec *iov; - u8 __user *type = NULL; - u8 __user *code = NULL; - int probed = 0; - unsigned int i; + struct icmphdr icmph; + int err; - if (!msg->msg_iov) + if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; - for (i = 0; i < msg->msg_iovlen; i++) { - iov = &msg->msg_iov[i]; - if (!iov) - continue; - - switch (fl4->flowi4_proto) { - case IPPROTO_ICMP: - /* check if one-byte field is readable or not. */ - if (iov->iov_base && iov->iov_len < 1) - break; - - if (!type) { - type = iov->iov_base; - /* check if code field is readable or not. */ - if (iov->iov_len > 1) - code = type + 1; - } else if (!code) - code = iov->iov_base; - - if (type && code) { - if (get_user(fl4->fl4_icmp_type, type) || - get_user(fl4->fl4_icmp_code, code)) - return -EFAULT; - probed = 1; - } - break; - default: - probed = 1; - break; - } - if (probed) - break; - } + /* We only need the first two bytes. 
*/ + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); + if (err) + return err; + + fl4->fl4_icmp_type = icmph.type; + fl4->fl4_icmp_code = icmph.code; + return 0; } -- cgit v1.2.3 From c008ba5bdc9fa830e1a349b20b0be5a137bdef7a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 7 Nov 2014 21:27:09 +0800 Subject: ipv4: Avoid reading user iov twice after raw_probe_proto_opt Ever since raw_probe_proto_opt was added it had the problem of causing the user iov to be read twice, once during the probe for the protocol header and once again in ip_append_data. This is a potential security problem since it means that whatever we're probing may be invalid. This patch plugs the hole by firstly advancing the iov so we don't read the same spot again, and secondly saving what we read the first time around for use by ip_append_data. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/raw.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9be905057b24..43385a9fa441 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -79,6 +79,16 @@ #include #include #include +#include + +struct raw_frag_vec { + struct iovec *iov; + union { + struct icmphdr icmph; + char c[1]; + } hdr; + int hlen; +}; static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), @@ -420,25 +430,57 @@ error: return err; } -static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) +static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { - struct icmphdr icmph; int err; if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; /* We only need the first two bytes. */ - err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); + rfv->hlen = 2; + + err = memcpy_fromiovec(rfv->hdr.c, rfv->iov, rfv->hlen); if (err) return err; - fl4->fl4_icmp_type = icmph.type; - fl4->fl4_icmp_code = icmph.code; + fl4->fl4_icmp_type = rfv->hdr.icmph.type; + fl4->fl4_icmp_code = rfv->hdr.icmph.code; return 0; } +static int raw_getfrag(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + struct raw_frag_vec *rfv = from; + + if (offset < rfv->hlen) { + int copy = min(rfv->hlen - offset, len); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + memcpy(to, rfv->hdr.c + offset, copy); + else + skb->csum = csum_block_add( + skb->csum, + csum_partial_copy_nocheck(rfv->hdr.c + offset, + to, copy, 0), + odd); + + odd = 0; + offset += copy; + to += copy; + len -= copy; + + if (!len) + return 0; + } + + offset -= rfv->hlen; + + return ip_generic_getfrag(rfv->iov, to, offset, len, odd, skb); +} + static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -452,6 +494,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, u8 tos; int err; struct ip_options_data opt_copy; + struct raw_frag_vec rfv; err = -EMSGSIZE; if (len > 0xFFFF) @@ -557,7 +600,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr, saddr, 0, 0); if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl4, msg); + rfv.iov = msg->msg_iov; + rfv.hlen = 0; + + err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } @@ -588,8 +634,8 @@ back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); - err = ip_append_data(sk, &fl4, ip_generic_getfrag, - msg->msg_iov, len, 0, + err = ip_append_data(sk, &fl4, raw_getfrag, + &rfv, len, 0, &ipc, &rt, 
msg->msg_flags); if (err) ip_flush_pending_frames(sk); -- cgit v1.2.3 From 3d97379a67486bc481ab5b8f7aa5b7ceb6154a95 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 Nov 2014 05:54:27 -0800 Subject: tcp: move sk_mark_napi_id() at the right place sk_mark_napi_id() is used to record for a flow napi id of incoming packets for busypoll sake. We should do this only on established flows, not on listeners. This was 'working' by virtue of the socket cloning, but doing this on SYN packets in unecessary cache line dirtying. Even if we move sk_napi_id in the same cache line than sk_lock, we are working to make SYN processing lockless, so it is desirable to set sk_napi_id only for established flows. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9c7d7621466b..8893598a4124 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1429,6 +1429,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = sk->sk_rx_dst; sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) { @@ -1450,6 +1451,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); + sk_mark_napi_id(sk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1661,7 +1663,6 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_mark_napi_id(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ace29b60813c..fd8e50b380e7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1293,6 +1293,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = sk->sk_rx_dst; sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, np->rx_dst_cookie) == NULL) { @@ -1322,6 +1323,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); + sk_mark_napi_id(sk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) @@ -1454,7 +1456,6 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_mark_napi_id(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); -- cgit v1.2.3 From 2c8c56e15df3d4c2af3d656e44feb18789f75837 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 Nov 2014 05:54:28 -0800 Subject: net: introduce SO_INCOMING_CPU Alternative to RPS/RFS is to use hardware support for multiple queues. Then split a set of million of sockets into worker threads, each one using epoll() to manage events on its own socket pool. Ideally, we want one thread per RX/TX queue/cpu, but we have no way to know after accept() or connect() on which queue/cpu a socket is managed. We normally use one cpu per RX queue (IRQ smp_affinity being properly set), so remembering on socket structure which cpu delivered last packet is enough to solve the problem. 
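As a rough userspace sketch of that sharding (an illustration, not part of this patch; the accept()/epoll plumbing around it is assumed to exist in the application), a server can map each new connection to the worker that owns the reported CPU:

  #include <sys/socket.h>

  #ifndef SO_INCOMING_CPU
  #define SO_INCOMING_CPU 49	/* asm-generic value added by this patch */
  #endif

  /* Pick the worker index for a freshly accepted fd; nr_workers must be > 0. */
  static unsigned int pick_worker(int fd, unsigned int nr_workers)
  {
          int cpu = 0;
          socklen_t len = sizeof(cpu);

          /* Older kernels reject the option (ENOPROTOOPT): fall back to worker 0. */
          if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) < 0 || cpu < 0)
                  cpu = 0;
          return (unsigned int)cpu % nr_workers;
  }
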
After accept(), connect(), or even file descriptor passing around processes, applications can use : int cpu; socklen_t len = sizeof(cpu); getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len); And use this information to put the socket into the right silo for optimal performance, as all networking stack should run on the appropriate cpu, without need to send IPI (RPS/RFS). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/avr32/include/uapi/asm/socket.h | 2 ++ arch/cris/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/powerpc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 12 ++++++++++++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 5 +++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/ipv6/udp.c | 1 + net/sctp/ulpqueue.c | 5 +++-- 21 files changed, 52 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 3de1394bcab8..e2fe0700b3b4 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -87,4 +87,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 6e6cd159924b..92121b0f5b98 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -80,4 +80,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h index ed94e5ed0a23..60f60f5b9b35 100644 --- a/arch/cris/include/uapi/asm/socket.h +++ b/arch/cris/include/uapi/asm/socket.h @@ -82,6 +82,8 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index ca2c6e6f31c6..2c6890209ea6 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -80,5 +80,7 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index a1b49bac7951..09a93fb566f6 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -89,4 +89,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 6c9a24b3aefa..e8589819c274 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -80,4 +80,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index a14baa218c76..2e9ee8c55a10 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -98,4 +98,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _UAPI_ASM_SOCKET_H 
*/ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index 6aa3ce1854aa..f3492e8c9f70 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -80,4 +80,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index fe35ceacf0e7..7984a1cab3da 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -79,4 +79,6 @@ #define SO_BPF_EXTENSIONS 0x4029 +#define SO_INCOMING_CPU 0x402A + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index a9c3e2e18c05..3474e4ef166d 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -87,4 +87,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index e031332096d7..8457636c33e1 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -86,4 +86,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 54d9608681b6..4a8003a94163 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -76,6 +76,8 @@ #define SO_BPF_EXTENSIONS 0x0032 +#define SO_INCOMING_CPU 0x0033 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 39acec0cf0b1..c46f6a696849 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -91,4 +91,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 6767d75ecb17..7789b59c0c40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -273,6 +273,7 @@ struct cg_proto; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_rxhash: flow hash received from netif layer + * @sk_incoming_cpu: record cpu processing incoming packets * @sk_txhash: computed flow hash for use on transmit * @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab @@ -350,6 +351,12 @@ struct sock { #ifdef CONFIG_RPS __u32 sk_rxhash; #endif + u16 sk_incoming_cpu; + /* 16bit hole + * Warned : sk_incoming_cpu can be set from softirq, + * Do not use this hole without fully understanding possible issues. 
+ */ + __u32 sk_txhash; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_napi_id; @@ -833,6 +840,11 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) return sk->sk_backlog_rcv(sk, skb); } +static inline void sk_incoming_cpu_update(struct sock *sk) +{ + sk->sk_incoming_cpu = raw_smp_processor_id(); +} + static inline void sock_rps_record_flow_hash(__u32 hash) { #ifdef CONFIG_RPS diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index ea0796bdcf88..f541ccefd4ac 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -82,4 +82,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index ac56dd06c306..0725cf0cb685 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_max_pacing_rate; break; + case SO_INCOMING_CPU: + v.val = sk->sk_incoming_cpu; + break; + default: return -ENOPROTOOPT; } @@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8893598a4124..2c6a955fd5c3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1663,6 +1663,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_incoming_cpu_update(sk); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5d0fdca8e965..d13751685f44 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); } rc = sock_queue_rcv_skb(sk, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fd8e50b380e7..1985b4933a6b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1456,6 +1456,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_incoming_cpu_update(sk); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b756355e9739..d1fe36274906 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!ipv6_addr_any(&sk->sk_v6_daddr)) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); } rc = sock_queue_rcv_skb(sk, skb); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index d49dc2ed30ad..ce469d648ffb 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) goto out_free; - if (!sctp_ulpevent_is_notification(event)) + if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); - + sk_incoming_cpu_update(sk); + } /* Check if the user wishes to receive this event. 
*/ if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) goto out_free; -- cgit v1.2.3 From ba7a46f16dd29f93303daeb1fee8af316c5a07f4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Nov 2014 10:59:17 -0800 Subject: net: Convert LIMIT_NETDEBUG to net_dbg_ratelimited Use the more common dynamic_debug capable net_dbg_ratelimited and remove the LIMIT_NETDEBUG macro. All messages are still ratelimited. Some KERN_ uses are changed to KERN_DEBUG. This may have some negative impact on messages that were emitted at KERN_INFO that are not not enabled at all unless DEBUG is defined or dynamic_debug is enabled. Even so, these messages are now _not_ emitted by default. This also eliminates the use of the net_msg_warn sysctl "/proc/sys/net/core/warnings". For backward compatibility, the sysctl is not removed, but it has no function. The extern declaration of net_msg_warn is removed from sock.h and made static in net/core/sysctl_net_core.c Miscellanea: o Update the sysctl documentation o Remove the embedded uses of pr_fmt o Coalesce format fragments o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 12 ++++++++---- include/net/sock.h | 7 ------- include/net/udplite.h | 6 +++--- net/core/sysctl_net_core.c | 2 ++ net/core/utils.c | 3 --- net/ipv4/icmp.c | 8 ++++---- net/ipv4/inet_fragment.c | 2 +- net/ipv4/ip_fragment.c | 3 +-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_timer.c | 18 ++++++++++-------- net/ipv4/udp.c | 30 +++++++++++++++--------------- net/ipv6/addrconf.c | 6 ++---- net/ipv6/ah6.c | 7 +++---- net/ipv6/datagram.c | 4 ++-- net/ipv6/esp6.c | 4 ++-- net/ipv6/exthdrs.c | 18 +++++++++--------- net/ipv6/icmp.c | 15 +++++++-------- net/ipv6/mip6.c | 11 ++++++----- net/ipv6/netfilter.c | 2 +- net/ipv6/udp.c | 31 +++++++++++++------------------ net/phonet/af_phonet.c | 9 +++++---- net/phonet/pep-gprs.c | 3 +-- net/phonet/pep.c | 12 ++++++------ 23 files changed, 105 insertions(+), 116 deletions(-) (limited to 'net/ipv4') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 04892b821157..e26c607468a6 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -120,10 +120,14 @@ seconds. warnings -------- -This controls console messages from the networking stack that can occur because -of problems on the network like duplicate address or bad checksums. Normally, -this should be enabled, but if the problem persists the messages can be -disabled. +This sysctl is now unused. + +This was used to control console messages from the networking stack that +occur because of problems on the network like duplicate address or bad +checksums. + +These messages are now emitted at KERN_DEBUG and can generally be enabled +and controlled by the dynamic_debug facility. netdev_budget ------------- diff --git a/include/net/sock.h b/include/net/sock.h index 7789b59c0c40..83a669f83bae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2288,13 +2288,6 @@ bool sk_ns_capable(const struct sock *sk, bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); -/* - * Enable debug/info messages - */ -extern int net_msg_warn; -#define LIMIT_NETDEBUG(fmt, args...) 
\ - do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0) - extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/include/net/udplite.h b/include/net/udplite.h index 2caadabcd07b..9a28a5179400 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -40,7 +40,7 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets * with a zero checksum field are illegal. */ if (uh->check == 0) { - LIMIT_NETDEBUG(KERN_DEBUG "UDPLite: zeroed checksum field\n"); + net_dbg_ratelimited("UDPLite: zeroed checksum field\n"); return 1; } @@ -52,8 +52,8 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) /* * Coverage length violates RFC 3828: log and discard silently. */ - LIMIT_NETDEBUG(KERN_DEBUG "UDPLite: bad csum coverage %d/%d\n", - cscov, skb->len); + net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n", + cscov, skb->len); return 1; } else if (cscov < skb->len) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cf9cd13509a7..f93f092fe226 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int net_msg_warn; /* Unused, but still a sysctl */ + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/net/core/utils.c b/net/core/utils.c index efc76dd9dcd1..7b803884c162 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -33,9 +33,6 @@ #include #include -int net_msg_warn __read_mostly = 1; -EXPORT_SYMBOL(net_msg_warn); - DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); /* * All net warning printk()s should be guarded by this function. diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5882f584910e..36b7bfa609d6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -784,8 +784,8 @@ static void icmp_unreach(struct sk_buff *skb) */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), - &iph->daddr); + net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", + &iph->daddr); break; case 2: goto out; @@ -798,8 +798,8 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), - &iph->daddr); + net_dbg_ratelimited("%pI4: Source Route Failed\n", + &iph->daddr); break; default: break; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 19419b60cb37..e7920352646a 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -458,6 +458,6 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, ". 
Dropping fragment.\n"; if (PTR_ERR(q) == -ENOBUFS) - LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); + net_dbg_ratelimited("%s%s", prefix, msg); } EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 4d964dadd655..e5b6d0ddcb58 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -618,8 +618,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, return 0; out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), - qp); + net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5f979c7f5135..d91436ba17ea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5854,12 +5854,12 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) struct inet_request_sock *ireq = inet_rsk(req); if (family == AF_INET) - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), - &ireq->ir_rmt_addr, port); + net_dbg_ratelimited("drop open request from %pI4/%u\n", + &ireq->ir_rmt_addr, port); #if IS_ENABLED(CONFIG_IPV6) else if (family == AF_INET6) - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), - &ireq->ir_v6_rmt_addr, port); + net_dbg_ratelimited("drop open request from %pI6/%u\n", + &ireq->ir_v6_rmt_addr, port); #endif } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 9b21ae8b2e31..1829c7fbc77e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -374,17 +374,19 @@ void tcp_retransmit_timer(struct sock *sk) */ struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &inet->inet_daddr, - ntohs(inet->inet_dport), inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &inet->inet_daddr, + ntohs(inet->inet_dport), + inet->inet_num, + tp->snd_una, tp->snd_nxt); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &sk->sk_v6_daddr, - ntohs(inet->inet_dport), inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &sk->sk_v6_daddr, + ntohs(inet->inet_dport), + inet->inet_num, + tp->snd_una, tp->snd_nxt); } #endif if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d13751685f44..1b6e9d5fadef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1051,7 +1051,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n")); + net_dbg_ratelimited("cork app bug 2\n"); err = -EINVAL; goto out; } @@ -1133,7 +1133,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n")); + net_dbg_ratelimited("udp cork app bug 3\n"); return -EINVAL; } @@ -1547,8 +1547,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * provided by the application." 
*/ if (up->pcrlen == 0) { /* full coverage was set */ - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", - UDP_SKB_CB(skb)->cscov, skb->len); + net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } /* The next case involves violating the min. coverage requested @@ -1558,8 +1558,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * Therefore the above ...()->partial_cov statement is essential. */ if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, up->pcrlen); + net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); goto drop; } } @@ -1828,11 +1828,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), - ulen, skb->len, - &daddr, ntohs(uh->dest)); + net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", + proto == IPPROTO_UDPLITE ? "Lite" : "", + &saddr, ntohs(uh->source), + ulen, skb->len, + &daddr, ntohs(uh->dest)); goto drop; csum_error: @@ -1840,10 +1840,10 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), - ulen); + net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", + proto == IPPROTO_UDPLITE ? "Lite" : "", + &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), + ulen); UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 06e897832a7a..251fcb48b216 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1411,10 +1411,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, if (unlikely(score->addr_type == IPV6_ADDR_ANY || score->addr_type & IPV6_ADDR_MULTICAST)) { - LIMIT_NETDEBUG(KERN_DEBUG - "ADDRCONF: unspecified / multicast address " - "assigned as unicast address on %s", - dev->name); + net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s", + dev->name); continue; } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 6d16eb0e0c7f..8ab1989198f6 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -272,10 +272,9 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) ipv6_rearrange_destopt(iph, exthdr.opth); case NEXTHDR_HOP: if (!zero_out_mutable_opts(exthdr.opth)) { - LIMIT_NETDEBUG( - KERN_WARNING "overrun %sopts\n", - nexthdr == NEXTHDR_HOP ? - "hop" : "dest"); + net_dbg_ratelimited("overrun %sopts\n", + nexthdr == NEXTHDR_HOP ? 
+ "hop" : "dest"); return -EINVAL; } break; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5c6996e44b14..cc1139687fd7 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -893,8 +893,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, break; } default: - LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", - cmsg->cmsg_type); + net_dbg_ratelimited("invalid cmsg type: %d\n", + cmsg->cmsg_type); err = -EINVAL; goto exit_f; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index d21d7b22eebc..d2c2d749b6db 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -286,8 +286,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) err = -EINVAL; padlen = nexthdr[0]; if (padlen + 2 + alen >= elen) { - LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage " - "padlen=%d, elen=%d\n", padlen + 2, elen - alen); + net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n", + padlen + 2, elen - alen); goto out; } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 601d896f22d0..a7bbbe45570b 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -184,7 +184,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) int ret; if (opt->dsthao) { - LIMIT_NETDEBUG(KERN_DEBUG "hao duplicated\n"); + net_dbg_ratelimited("hao duplicated\n"); goto discard; } opt->dsthao = opt->dst1; @@ -193,14 +193,14 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); if (hao->length != 16) { - LIMIT_NETDEBUG( - KERN_DEBUG "hao invalid option length = %d\n", hao->length); + net_dbg_ratelimited("hao invalid option length = %d\n", + hao->length); goto discard; } if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { - LIMIT_NETDEBUG( - KERN_DEBUG "hao is not an unicast addr: %pI6\n", &hao->addr); + net_dbg_ratelimited("hao is not an unicast addr: %pI6\n", + &hao->addr); goto discard; } @@ -551,8 +551,8 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); return true; } - LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", - nh[optoff + 1]); + net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n", + nh[optoff + 1]); kfree_skb(skb); return false; } @@ -566,8 +566,8 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { - LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", - nh[optoff+1]); + net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", + nh[optoff+1]); IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS); goto drop; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 62c1037d9e83..092934032077 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -338,7 +338,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, * anycast. */ if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { - LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: acast source\n"); + net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); } @@ -452,7 +452,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) * and anycast addresses will be checked later. 
*/ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: addr_any/mcast source\n"); + net_dbg_ratelimited("icmp6_send: addr_any/mcast source\n"); return; } @@ -460,7 +460,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: no reply to icmp error\n"); + net_dbg_ratelimited("icmp6_send: no reply to icmp error\n"); return; } @@ -509,7 +509,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) len = skb->len - msg.offset; len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); if (len < 0) { - LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); + net_dbg_ratelimited("icmp: len problem\n"); goto out_dst_release; } @@ -706,9 +706,8 @@ static int icmpv6_rcv(struct sk_buff *skb) daddr = &ipv6_hdr(skb)->daddr; if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) { - LIMIT_NETDEBUG(KERN_DEBUG - "ICMPv6 checksum failed [%pI6c > %pI6c]\n", - saddr, daddr); + net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n", + saddr, daddr); goto csum_error; } @@ -781,7 +780,7 @@ static int icmpv6_rcv(struct sk_buff *skb) if (type & ICMPV6_INFOMSG_MASK) break; - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); + net_dbg_ratelimited("icmpv6: msg of unknown type\n"); /* * error of unknown type. diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index f61429d391d3..b9779d441b12 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -97,16 +97,17 @@ static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) return -1; if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { - LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n", - mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); + net_dbg_ratelimited("mip6: MH message too short: %d vs >=%d\n", + mh->ip6mh_hdrlen, + mip6_mh_len(mh->ip6mh_type)); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + skb_network_header_len(skb)); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { - LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n", - mh->ip6mh_proto); + net_dbg_ratelimited("mip6: MH invalid payload proto = %d\n", + mh->ip6mh_proto); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + skb_network_header_len(skb)); return -1; @@ -288,7 +289,7 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, * XXX: packet if HAO exists. 
*/ if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) { - LIMIT_NETDEBUG(KERN_WARNING "mip6: hao exists already, override\n"); + net_dbg_ratelimited("mip6: hao exists already, override\n"); return offset; } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d38e6a8d8b9f..398377a9d018 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -36,7 +36,7 @@ int ip6_route_me_harder(struct sk_buff *skb) err = dst->error; if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); - LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); + net_dbg_ratelimited("ip6_route_me_harder: No more route\n"); dst_release(dst); return err; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d1fe36274906..0ba3de4f2368 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -660,15 +660,13 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { if (up->pcrlen == 0) { /* full coverage was set */ - LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: partial coverage" - " %d while full coverage %d requested\n", - UDP_SKB_CB(skb)->cscov, skb->len); + net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { - LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: coverage %d " - "too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, up->pcrlen); + net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); goto drop; } } @@ -761,9 +759,9 @@ static void udp6_csum_zero_error(struct sk_buff *skb) /* RFC 2460 section 8.1 says that we SHOULD log * this error. Well, it is reasonable. */ - LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n", - &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source), - &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest)); + net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n", + &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source), + &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest)); } /* @@ -931,14 +929,11 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", - proto == IPPROTO_UDPLITE ? "-Lite" : "", - saddr, - ntohs(uh->source), - ulen, - skb->len, - daddr, - ntohs(uh->dest)); + net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", + proto == IPPROTO_UDPLITE ? "-Lite" : "", + saddr, ntohs(uh->source), + ulen, skb->len, + daddr, ntohs(uh->dest)); goto discard; csum_error: UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); @@ -1290,7 +1285,7 @@ back_from_confirm: /* ... which is an evident application bug. 
--ANK */ release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); + net_dbg_ratelimited("udp cork app bug 2\n"); err = -EINVAL; goto out; } diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 5a940dbd74a3..32ab87d34828 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -426,16 +426,17 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa)); if (!out_dev) { - LIMIT_NETDEBUG(KERN_WARNING"No Phonet route to %02X\n", - pn_sockaddr_get_addr(&sa)); + net_dbg_ratelimited("No Phonet route to %02X\n", + pn_sockaddr_get_addr(&sa)); goto out; } __skb_push(skb, sizeof(struct phonethdr)); skb->dev = out_dev; if (out_dev == dev) { - LIMIT_NETDEBUG(KERN_ERR"Phonet loop to %02X on %s\n", - pn_sockaddr_get_addr(&sa), dev->name); + net_dbg_ratelimited("Phonet loop to %02X on %s\n", + pn_sockaddr_get_addr(&sa), + dev->name); goto out_dev; } /* Some drivers (e.g. TUN) do not allocate HW header space */ diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index e9a83a637185..fa8237fdc57b 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -203,8 +203,7 @@ static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev) len = skb->len; err = pep_write(sk, skb); if (err) { - LIMIT_NETDEBUG(KERN_WARNING"%s: TX error (%d)\n", - dev->name, err); + net_dbg_ratelimited("%s: TX error (%d)\n", dev->name, err); dev->stats.tx_aborted_errors++; dev->stats.tx_errors++; } else { diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 44b2123e22b8..9cd069dfaf65 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -272,8 +272,8 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) hdr = pnp_hdr(skb); if (hdr->data[0] != PN_PEP_TYPE_COMMON) { - LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n", - (unsigned int)hdr->data[0]); + net_dbg_ratelimited("Phonet unknown PEP type: %u\n", + (unsigned int)hdr->data[0]); return -EOPNOTSUPP; } @@ -304,8 +304,8 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) break; default: - LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP indication: %u\n", - (unsigned int)hdr->data[1]); + net_dbg_ratelimited("Phonet unknown PEP indication: %u\n", + (unsigned int)hdr->data[1]); return -EOPNOTSUPP; } if (wake) @@ -451,8 +451,8 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) break; default: - LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP message: %u\n", - hdr->message_id); + net_dbg_ratelimited("Phonet unknown PEP message: %u\n", + hdr->message_id); err = -EINVAL; } out: -- cgit v1.2.3 From d7480fd3b1738a8eae6a76098b17af318cf9b9cc Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 10 Nov 2014 15:59:36 -0800 Subject: neigh: remove dynamic neigh table registration support Currently there are only three neigh tables in the whole kernel: arp table, ndisc table and decnet neigh table. What's more, we don't support registering multiple tables per family. Therefore we can just make these tables statically built-in. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/neighbour.h | 11 ++- net/core/neighbour.c | 247 ++++++++++++++++++++++-------------------------- net/decnet/dn_neigh.c | 4 +- net/ipv4/arp.c | 2 +- net/ipv6/ndisc.c | 4 +- 5 files changed, 126 insertions(+), 142 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index dedfb188b1a7..eb070b3674a1 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -220,6 +220,13 @@ struct neigh_table { struct pneigh_entry **phash_buckets; }; +enum { + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_DN_TABLE = 2, + NEIGH_NR_TABLES, +}; + static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; @@ -240,8 +247,8 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 -void neigh_table_init(struct neigh_table *tbl); -int neigh_table_clear(struct neigh_table *tbl); +void neigh_table_init(int index, struct neigh_table *tbl); +int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index edd04116ecb7..8e38f17288d3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -56,7 +56,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags); static void neigh_update_notify(struct neighbour *neigh); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); -static struct neigh_table *neigh_tables; #ifdef CONFIG_PROC_FS static const struct file_operations neigh_stat_seq_fops; #endif @@ -87,13 +86,8 @@ static const struct file_operations neigh_stat_seq_fops; the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. - - The last lock is neigh_tbl_lock. It is pure SMP lock, protecting - list of neighbour tables. This list is used only in process context, */ -static DEFINE_RWLOCK(neigh_tbl_lock); - static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); @@ -1520,7 +1514,9 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static void neigh_table_init_no_netlink(struct neigh_table *tbl) +static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; + +void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; @@ -1566,34 +1562,14 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; -} - -void neigh_table_init(struct neigh_table *tbl) -{ - struct neigh_table *tmp; - neigh_table_init_no_netlink(tbl); - write_lock(&neigh_tbl_lock); - for (tmp = neigh_tables; tmp; tmp = tmp->next) { - if (tmp->family == tbl->family) - break; - } - tbl->next = neigh_tables; - neigh_tables = tbl; - write_unlock(&neigh_tbl_lock); - - if (unlikely(tmp)) { - pr_err("Registering multiple tables for family %d\n", - tbl->family); - dump_stack(); - } + neigh_tables[index] = tbl; } EXPORT_SYMBOL(neigh_table_init); -int neigh_table_clear(struct neigh_table *tbl) +int neigh_table_clear(int index, struct neigh_table *tbl) { - struct neigh_table **tp; - + neigh_tables[index] = NULL; /* It is not clean... 
Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); @@ -1601,14 +1577,6 @@ int neigh_table_clear(struct neigh_table *tbl) neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); - write_lock(&neigh_tbl_lock); - for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { - if (*tp == tbl) { - *tp = tbl->next; - break; - } - } - write_unlock(&neigh_tbl_lock); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); @@ -1626,12 +1594,32 @@ int neigh_table_clear(struct neigh_table *tbl) } EXPORT_SYMBOL(neigh_table_clear); +static struct neigh_table *neigh_find_table(int family) +{ + struct neigh_table *tbl = NULL; + + switch (family) { + case AF_INET: + tbl = neigh_tables[NEIGH_ARP_TABLE]; + break; + case AF_INET6: + tbl = neigh_tables[NEIGH_ND_TABLE]; + break; + case AF_DECnet: + tbl = neigh_tables[NEIGH_DN_TABLE]; + break; + } + + return tbl; +} + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; + struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; @@ -1652,39 +1640,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) } } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - struct neighbour *neigh; - - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); - - if (nla_len(dst_attr) < tbl->key_len) - goto out; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (ndm->ndm_flags & NTF_PROXY) { - err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out; - } + if (nla_len(dst_attr) < tbl->key_len) + goto out; - if (dev == NULL) - goto out; + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } - neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); - if (neigh == NULL) { - err = -ENOENT; - goto out; - } + if (dev == NULL) + goto out; - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); - neigh_release(neigh); + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; goto out; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); out: return err; @@ -1692,11 +1672,14 @@ out: static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) { + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; + struct neighbour *neigh; + void *dst, *lladdr; int err; ASSERT_RTNL(); @@ -1720,70 +1703,60 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; - struct neighbour *neigh; - void *dst, *lladdr; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); + if (nla_len(tb[NDA_DST]) < tbl->key_len) + goto out; + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? 
nla_data(tb[NDA_LLADDR]) : NULL; - if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out; - dst = nla_data(tb[NDA_DST]); - lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + err = -ENOBUFS; + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm->ndm_flags; + err = 0; + } + goto out; + } - if (ndm->ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; + if (dev == NULL) + goto out; - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm->ndm_flags; - err = 0; - } + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; goto out; } - if (dev == NULL) + neigh = __neigh_lookup_errno(tbl, dst, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); goto out; - - neigh = neigh_lookup(tbl, dst, dev); - if (neigh == NULL) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - err = -ENOENT; - goto out; - } - - neigh = __neigh_lookup_errno(tbl, dst, dev); - if (IS_ERR(neigh)) { - err = PTR_ERR(neigh); - goto out; - } - } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { - err = -EEXIST; - neigh_release(neigh); - goto out; - } - - if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) - flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - if (ndm->ndm_flags & NTF_USE) { - neigh_event_send(neigh, NULL); - err = 0; - } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); - neigh_release(neigh); - goto out; + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + neigh_release(neigh); + out: return err; } @@ -1982,7 +1955,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; - int err; + bool found = false; + int err, tidx; err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy); @@ -1995,19 +1969,21 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) } ndtmsg = nlmsg_data(nlh); - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { + + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { + tbl = neigh_tables[tidx]; + if (!tbl) + continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; - - if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { + found = true; break; + } } - if (tbl == NULL) { - err = -ENOENT; - goto errout_locked; - } + if (!found) + return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and @@ -2118,8 +2094,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) errout_tbl_lock: write_unlock_bh(&tbl->lock); -errout_locked: - read_unlock(&neigh_tbl_lock); errout: return err; } @@ -2134,10 +2108,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; + tbl = neigh_tables[tidx]; + if (!tbl) + continue; + if (tidx < tbl_skip || (family && tbl->family != family)) continue; @@ -2168,7 
+2145,6 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: - read_unlock(&neigh_tbl_lock); cb->args[0] = tidx; cb->args[1] = nidx; @@ -2351,7 +2327,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) int proxy = 0; int err; - read_lock(&neigh_tbl_lock); family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is @@ -2363,8 +2338,11 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tbl = neigh_tables, t = 0; tbl; - tbl = tbl->next, t++) { + for (t = 0; t < NEIGH_NR_TABLES; t++) { + tbl = neigh_tables[t]; + + if (!tbl) + continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) @@ -2377,7 +2355,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } - read_unlock(&neigh_tbl_lock); cb->args[0] = t; return skb->len; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index c8121ceddb9e..7ca7c3143da3 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -591,7 +591,7 @@ static const struct file_operations dn_neigh_seq_fops = { void __init dn_neigh_init(void) { - neigh_table_init(&dn_neigh_table); + neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table); proc_create("decnet_neigh", S_IRUGO, init_net.proc_net, &dn_neigh_seq_fops); } @@ -599,5 +599,5 @@ void __init dn_neigh_init(void) void __exit dn_neigh_cleanup(void) { remove_proc_entry("decnet_neigh", init_net.proc_net); - neigh_table_clear(&dn_neigh_table); + neigh_table_clear(NEIGH_DN_TABLE, &dn_neigh_table); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 16acb59d665e..205e1472aa78 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1292,7 +1292,7 @@ static int arp_proc_init(void); void __init arp_init(void) { - neigh_table_init(&arp_tbl); + neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); arp_proc_init(); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 4cb45c1079a2..2c9f6bf57325 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1763,7 +1763,7 @@ int __init ndisc_init(void) /* * Initialize the neighbour table */ - neigh_table_init(&nd_tbl); + neigh_table_init(NEIGH_ND_TABLE, &nd_tbl); #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, @@ -1796,6 +1796,6 @@ void ndisc_cleanup(void) #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif - neigh_table_clear(&nd_tbl); + neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl); unregister_pernet_subsys(&ndisc_net_ops); } -- cgit v1.2.3 From 4243cdc2c1e5a1375cc8397e8f9b06530f099c60 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Nov 2014 21:59:20 -0800 Subject: udp: Neaten function pointer calls and add braces Standardize function pointer uses. Convert calling style from: (*foo)(args...); to: foo(args...); Other miscellanea: o Add braces around loops with single ifs on multiple lines o Realign arguments around these functions o Invert logic in if to return immediately. Signed-off-by: Joe Perches Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1b6e9d5fadef..4a16b9129079 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -144,7 +144,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) + sk_nulls_for_each(sk2, node, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -152,14 +152,13 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { - if (bitmap) - __set_bit(udp_sk(sk2)->udp_port_hash >> log, - bitmap); - else + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { + if (!bitmap) return 1; + __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); } + } return 0; } @@ -168,10 +167,10 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, * can insert/delete a socket with local_port == num */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, - struct udp_hslot *hslot2, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) { struct sock *sk2; struct hlist_nulls_node *node; @@ -179,7 +178,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -187,11 +186,12 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { res = 1; break; } + } spin_unlock(&hslot2->lock); return res; } @@ -206,8 +206,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -2033,7 +2033,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } else { up->corkflag = 0; lock_sock(sk); - (*push_pending_frames)(sk); + push_pending_frames(sk); release_sock(sk); } break; -- cgit v1.2.3 From a8c5f90fb59a2d3bff0bd29adbb3e39fe0dd52f8 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 12 Nov 2014 11:54:09 -0800 Subject: ip_tunnel: Ops registration for secondary encap (fou, gue) Instead of calling fou and gue functions directly from ip_tunnel use ops for these that were previously registered. This patch adds the logic to add and remove encapsulation operations for ip_tunnel, and modified fou (and gue) to register with ip_tunnels. This patch also addresses a circular dependency between ip_tunnel and fou that was causing link errors when CONFIG_NET_IP_TUNNEL=y and CONFIG_NET_FOU=m. 
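As a rough illustration only (not part of this patch), a third encapsulation module would hook into the new ops table along these lines; the "myencap" names and the slot number are invented for the sketch, while struct ip_tunnel_encap_ops and the add/del helpers are the interfaces introduced here:

	/* Sketch only: "myencap" and its slot number are hypothetical. */
	#include <linux/udp.h>
	#include <net/ip_tunnels.h>

	static size_t myencap_encap_hlen(struct ip_tunnel_encap *e)
	{
		return sizeof(struct udphdr);	/* fixed outer header in this sketch */
	}

	static int myencap_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
					u8 *protocol, struct flowi4 *fl4)
	{
		/* push the outer header onto skb and update *protocol / fl4 here */
		return 0;
	}

	static const struct ip_tunnel_encap_ops myencap_iptun_ops = {
		.encap_hlen   = myencap_encap_hlen,
		.build_header = myencap_build_header,
	};

	static int myencap_register(void)
	{
		/* hypothetical slot; fou/gue pass TUNNEL_ENCAP_FOU / TUNNEL_ENCAP_GUE */
		return ip_tunnel_encap_add_ops(&myencap_iptun_ops, TUNNEL_ENCAP_GUE + 1);
	}

	static void myencap_unregister(void)
	{
		ip_tunnel_encap_del_ops(&myencap_iptun_ops, TUNNEL_ENCAP_GUE + 1);
	}

A real user must also drop its slot again before unload, as fou does in its init/fini paths below.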
References to fou an gue have been removed from ip_tunnel.c Reported-by: Randy Dunlap Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/fou.h | 25 ++------------ include/net/ip_tunnels.h | 16 +++++++++ net/ipv4/fou.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_tunnel.c | 78 ++++++++++++++++++++++++++++++-------------- 4 files changed, 157 insertions(+), 47 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/fou.h b/include/net/fou.h index 25b26ffcf1df..19b8a0c62a98 100644 --- a/include/net/fou.h +++ b/include/net/fou.h @@ -8,31 +8,12 @@ #include #include +size_t fou_encap_hlen(struct ip_tunnel_encap *e); +static size_t gue_encap_hlen(struct ip_tunnel_encap *e); + int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); -static size_t fou_encap_hlen(struct ip_tunnel_encap *e) -{ - return sizeof(struct udphdr); -} - -static size_t gue_encap_hlen(struct ip_tunnel_encap *e) -{ - size_t len; - bool need_priv = false; - - len = sizeof(struct udphdr) + sizeof(struct guehdr); - - if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { - len += GUE_PLEN_REMCSUM; - need_priv = true; - } - - len += need_priv ? GUE_LEN_PRIV : 0; - - return len; -} - #endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5bc6edeb7143..25a59eb388a6 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -117,6 +117,22 @@ struct ip_tunnel_net { struct hlist_head tunnels[IP_TNL_HASH_SIZE]; }; +struct ip_tunnel_encap_ops { + size_t (*encap_hlen)(struct ip_tunnel_encap *e); + int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4); +}; + +#define MAX_IPTUN_ENCAP_OPS 8 + +extern const struct ip_tunnel_encap_ops __rcu * + iptun_encaps[MAX_IPTUN_ENCAP_OPS]; + +int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, + unsigned int num); +int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, + unsigned int num); + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 740ae099a0d9..fe0907774ce8 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -668,6 +668,30 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +size_t fou_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct udphdr); +} +EXPORT_SYMBOL(fou_encap_hlen); + +size_t gue_encap_hlen(struct ip_tunnel_encap *e) +{ + size_t len; + bool need_priv = false; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + + if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { + len += GUE_PLEN_REMCSUM; + need_priv = true; + } + + len += need_priv ? 
GUE_LEN_PRIV : 0; + + return len; +} +EXPORT_SYMBOL(gue_encap_hlen); + static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, struct flowi4 *fl4, u8 *protocol, __be16 sport) { @@ -787,6 +811,57 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } EXPORT_SYMBOL(gue_build_header); +#ifdef CONFIG_NET_FOU_IP_TUNNELS + +static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = { + .encap_hlen = fou_encap_hlen, + .build_header = fou_build_header, +}; + +static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = { + .encap_hlen = gue_encap_hlen, + .build_header = gue_build_header, +}; + +static int ip_tunnel_encap_add_fou_ops(void) +{ + int ret; + + ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + if (ret < 0) { + pr_err("can't add fou ops\n"); + return ret; + } + + ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); + if (ret < 0) { + pr_err("can't add gue ops\n"); + ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + return ret; + } + + return 0; +} + +static void ip_tunnel_encap_del_fou_ops(void) +{ + ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); +} + +#else + +static int ip_tunnel_encap_add_fou_ops(void) +{ + return 0; +} + +static int ip_tunnel_encap_del_fou_ops(void) +{ +} + +#endif + static int __init fou_init(void) { int ret; @@ -794,6 +869,14 @@ static int __init fou_init(void) ret = genl_register_family_with_ops(&fou_nl_family, fou_nl_ops); + if (ret < 0) + goto exit; + + ret = ip_tunnel_encap_add_fou_ops(); + if (ret < 0) + genl_unregister_family(&fou_nl_family); + +exit: return ret; } @@ -801,6 +884,8 @@ static void __exit fou_fini(void) { struct fou *fou, *next; + ip_tunnel_encap_del_fou_ops(); + genl_unregister_family(&fou_nl_family); /* Close all the FOU sockets */ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c3587e1c8b82..63e745aadab6 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -57,10 +57,6 @@ #include #include -#if IS_ENABLED(CONFIG_NET_FOU) -#include -#endif - #if IS_ENABLED(CONFIG_IPV6) #include #include @@ -494,19 +490,50 @@ EXPORT_SYMBOL_GPL(ip_tunnel_rcv); static int ip_encap_hlen(struct ip_tunnel_encap *e) { - switch (e->type) { - case TUNNEL_ENCAP_NONE: + const struct ip_tunnel_encap_ops *ops; + int hlen = -EINVAL; + + if (e->type == TUNNEL_ENCAP_NONE) return 0; -#if IS_ENABLED(CONFIG_NET_FOU) - case TUNNEL_ENCAP_FOU: - return fou_encap_hlen(e); - case TUNNEL_ENCAP_GUE: - return gue_encap_hlen(e); -#endif - default: + + if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; - } + + rcu_read_lock(); + ops = rcu_dereference(iptun_encaps[e->type]); + if (likely(ops && ops->encap_hlen)) + hlen = ops->encap_hlen(e); + rcu_read_unlock(); + + return hlen; +} + +const struct ip_tunnel_encap_ops __rcu * + iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; + +int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, + unsigned int num) +{ + return !cmpxchg((const struct ip_tunnel_encap_ops **) + &iptun_encaps[num], + NULL, ops) ? 0 : -1; } +EXPORT_SYMBOL(ip_tunnel_encap_add_ops); + +int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, + unsigned int num) +{ + int ret; + + ret = (cmpxchg((const struct ip_tunnel_encap_ops **) + &iptun_encaps[num], + ops, NULL) == ops) ? 
0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(ip_tunnel_encap_del_ops); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap) @@ -534,18 +561,19 @@ EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, u8 *protocol, struct flowi4 *fl4) { - switch (t->encap.type) { - case TUNNEL_ENCAP_NONE: + const struct ip_tunnel_encap_ops *ops; + int ret = -EINVAL; + + if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; -#if IS_ENABLED(CONFIG_NET_FOU) - case TUNNEL_ENCAP_FOU: - return fou_build_header(skb, &t->encap, protocol, fl4); - case TUNNEL_ENCAP_GUE: - return gue_build_header(skb, &t->encap, protocol, fl4); -#endif - default: - return -EINVAL; - } + + rcu_read_lock(); + ops = rcu_dereference(iptun_encaps[t->encap.type]); + if (likely(ops && ops->build_header)) + ret = ops->build_header(skb, &t->encap, protocol, fl4); + rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL(ip_tunnel_encap); -- cgit v1.2.3 From 56768644317c7746cb63f61573fcdc2355885707 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Nov 2014 10:04:16 +0100 Subject: netfilter: fix various sparse warnings net/bridge/br_netfilter.c:870:6: symbol 'br_netfilter_enable' was not declared. Should it be static? no; add include net/ipv4/netfilter/nft_reject_ipv4.c:22:6: symbol 'nft_reject_ipv4_eval' was not declared. Should it be static? yes net/ipv6/netfilter/nf_reject_ipv6.c:16:6: symbol 'nf_send_reset6' was not declared. Should it be static? no; add include net/ipv6/netfilter/nft_reject_ipv6.c:22:6: symbol 'nft_reject_ipv6_eval' was not declared. Should it be static? yes net/netfilter/core.c:33:32: symbol 'nf_ipv6_ops' was not declared. Should it be static? no; add include net/netfilter/xt_DSCP.c:40:57: cast truncates bits from constant value (ffffff03 becomes 3) net/netfilter/xt_DSCP.c:57:59: cast truncates bits from constant value (ffffff03 becomes 3) add __force, 3 is what we want. net/ipv4/netfilter/nf_log_arp.c:77:6: symbol 'nf_log_arp_packet' was not declared. Should it be static? yes net/ipv4/netfilter/nf_reject_ipv4.c:17:6: symbol 'nf_send_reset' was not declared. Should it be static? 
no; add include Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 1 + net/ipv4/netfilter/nf_log_arp.c | 12 ++++++------ net/ipv4/netfilter/nf_reject_ipv4.c | 1 + net/ipv4/netfilter/nft_reject_ipv4.c | 7 +++---- net/ipv6/netfilter/nf_reject_ipv6.c | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 7 +++---- net/netfilter/core.c | 1 + net/netfilter/xt_DSCP.c | 6 ++++-- 8 files changed, 20 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 1bada53bb195..f81dc33bf8a1 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "br_private.h" diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 0c8799a0c9e4..d059182c1466 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -75,12 +75,12 @@ static void dump_arp_packet(struct nf_log_buf *m, ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); } -void nf_log_arp_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) +static void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) { struct nf_log_buf *m; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 92b303dbd5fc..cdcb9a5d6786 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -11,6 +11,7 @@ #include #include #include +#include #include /* Send RST reply */ diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ed33299c56d1..d729542bd1b7 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -19,9 +19,9 @@ #include #include -void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +static void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); @@ -36,7 +36,6 @@ void nft_reject_ipv4_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } -EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); static struct nft_expr_type nft_reject_ipv4_type; static const struct nft_expr_ops nft_reject_ipv4_ops = { diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 20d9defc6c59..87576ff15e1b 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -11,6 +11,7 @@ #include #include #include +#include #include void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 0bc19fa87821..f73285924144 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -19,9 +19,9 @@ #include #include -void nft_reject_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +static void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct 
nft_reject *priv = nft_expr_priv(expr); struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); @@ -38,7 +38,6 @@ void nft_reject_ipv6_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } -EXPORT_SYMBOL_GPL(nft_reject_ipv6_eval); static struct nft_expr_type nft_reject_ipv6_type; static const struct nft_expr_ops nft_reject_ipv6_ops = { diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 024a2e25c8a4..fea9ef566427 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index ae8271652efa..3f83d38c4e5b 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -37,7 +37,8 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) if (!skb_make_writable(skb, sizeof(struct iphdr))) return NF_DROP; - ipv4_change_dsfield(ip_hdr(skb), (__u8)(~XT_DSCP_MASK), + ipv4_change_dsfield(ip_hdr(skb), + (__force __u8)(~XT_DSCP_MASK), dinfo->dscp << XT_DSCP_SHIFT); } @@ -54,7 +55,8 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; - ipv6_change_dsfield(ipv6_hdr(skb), (__u8)(~XT_DSCP_MASK), + ipv6_change_dsfield(ipv6_hdr(skb), + (__force __u8)(~XT_DSCP_MASK), dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; -- cgit v1.2.3 From 882288c05ede954e797baa623062f5ea06663ae1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 13 Nov 2014 12:48:21 +0100 Subject: FOU: Fix no return statement warning for !CONFIG_NET_FOU_IP_TUNNELS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/ipv4/fou.c: In function ‘ip_tunnel_encap_del_fou_ops’: net/ipv4/fou.c:861:1: warning: no return statement in function returning non-void [-Wreturn-type] Fixes: a8c5f90fb5 ("ip_tunnel: Ops registration for secondary encap (fou, gue)") Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/fou.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index fe0907774ce8..b0b436b0692c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -856,7 +856,7 @@ static int ip_tunnel_encap_add_fou_ops(void) return 0; } -static int ip_tunnel_encap_del_fou_ops(void) +static void ip_tunnel_encap_del_fou_ops(void) { } -- cgit v1.2.3 From d649a7a81f3b5bacb1d60abd7529894d8234a666 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Nov 2014 09:45:22 -0800 Subject: tcp: limit GSO packets to half cwnd In DC world, GSO packets initially cooked by tcp_sendmsg() are usually big, as sk_pacing_rate is high. When network is congested, cwnd can be smaller than the GSO packets found in socket write queue. tcp_write_xmit() splits GSO packets using the available cwnd, and we end up sending a single GSO packet, consuming all available cwnd. With GRO aggregation on the receiver, we might handle a single GRO packet, sending back a single ACK. 1) This single ACK might be lost TLP or RTO are forced to attempt a retransmit. 2) This ACK releases a full cwnd, sender sends another big GSO packet, in a ping pong mode. This behavior does not fill the pipes in the best way, because of scheduling artifacts. Make sure we always have at least two GSO packets in flight. This allows us to safely increase GRO efficiency without risking spurious retransmits. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0b88158dd4a7..eb73a1dccf56 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1562,7 +1562,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb) { - u32 in_flight, cwnd; + u32 in_flight, cwnd, halfcwnd; /* Don't be strict about the congestion window for the final FIN. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && @@ -1571,10 +1571,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, in_flight = tcp_packets_in_flight(tp); cwnd = tp->snd_cwnd; - if (in_flight < cwnd) - return (cwnd - in_flight); + if (in_flight >= cwnd) + return 0; - return 0; + /* For better scheduling, ensure we have at least + * 2 GSO packets in flight. + */ + halfcwnd = max(cwnd >> 1, 1U); + return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. -- cgit v1.2.3 From e3e3217029a35c579bf100998b43976d0b1cb8d7 Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Mon, 17 Nov 2014 14:04:29 -0800 Subject: icmp: Remove some spurious dropped packet profile hits from the ICMP path If icmp_rcv() has successfully processed the incoming ICMP datagram, we should use consume_skb() rather than kfree_skb() because a hit on the likes of perf -e skb:kfree_skb is not called-for. Signed-off-by: Rick Jones Signed-off-by: David S. Miller --- include/net/ping.h | 2 +- net/ipv4/icmp.c | 43 ++++++++++++++++++++++++++++--------------- net/ipv4/ping.c | 6 +++--- net/ipv6/icmp.c | 12 ++++++++++-- 4 files changed, 42 insertions(+), 21 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ping.h b/include/net/ping.h index 026479b61a2d..f074060bc5de 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -82,7 +82,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -void ping_rcv(struct sk_buff *skb); +bool ping_rcv(struct sk_buff *skb); #ifdef CONFIG_PROC_FS struct ping_seq_afinfo { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36b7bfa609d6..36f5584d93c5 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert); */ struct icmp_control { - void (*handler)(struct sk_buff *skb); + bool (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto) * ICMP_PARAMETERPROB. */ -static void icmp_unreach(struct sk_buff *skb) +static bool icmp_unreach(struct sk_buff *skb) { const struct iphdr *iph; struct icmphdr *icmph; @@ -839,10 +839,10 @@ static void icmp_unreach(struct sk_buff *skb) icmp_socket_deliver(skb, info); out: - return; + return true; out_err: ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - goto out; + return false; } @@ -850,17 +850,20 @@ out_err: * Handle ICMP_REDIRECT. 
*/ -static void icmp_redirect(struct sk_buff *skb) +static bool icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - return; + return false; } - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) { + /* there aught to be a stat */ + return false; + } icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); + return true; } /* @@ -875,7 +878,7 @@ static void icmp_redirect(struct sk_buff *skb) * See also WRT handling of options once they are done and working. */ -static void icmp_echo(struct sk_buff *skb) +static bool icmp_echo(struct sk_buff *skb) { struct net *net; @@ -891,6 +894,8 @@ static void icmp_echo(struct sk_buff *skb) icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); } + /* should there be an ICMP stat for ignored echos? */ + return true; } /* @@ -900,7 +905,7 @@ static void icmp_echo(struct sk_buff *skb) * MUST be accurate to a few minutes. * MUST be updated at least at 15Hz. */ -static void icmp_timestamp(struct sk_buff *skb) +static bool icmp_timestamp(struct sk_buff *skb) { struct timespec tv; struct icmp_bxm icmp_param; @@ -927,15 +932,17 @@ static void icmp_timestamp(struct sk_buff *skb) icmp_param.data_len = 0; icmp_param.head_len = sizeof(struct icmphdr) + 12; icmp_reply(&icmp_param, skb); -out: - return; + return true; + out_err: ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); - goto out; + return false; } -static void icmp_discard(struct sk_buff *skb) +static bool icmp_discard(struct sk_buff *skb) { + /* pretend it was a success */ + return true; } /* @@ -946,6 +953,7 @@ int icmp_rcv(struct sk_buff *skb) struct icmphdr *icmph; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool success; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -1012,7 +1020,12 @@ int icmp_rcv(struct sk_buff *skb) } } - icmp_pointers[icmph->type].handler(skb); + success = icmp_pointers[icmph->type].handler(skb); + + if (success) { + consume_skb(skb); + return 0; + } drop: kfree_skb(skb); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 736236c3e554..ce2920f5bef3 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -955,7 +955,7 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); * All we need to do is get the socket. */ -void ping_rcv(struct sk_buff *skb) +bool ping_rcv(struct sk_buff *skb) { struct sock *sk; struct net *net = dev_net(skb->dev); @@ -974,11 +974,11 @@ void ping_rcv(struct sk_buff *skb) pr_debug("rcv on socket %p\n", sk); ping_queue_rcv_skb(sk, skb_get(skb)); sock_put(sk); - return; + return true; } pr_debug("no socket, dropping\n"); - /* We're called from icmp_rcv(). kfree_skb() is done there. 
*/ + return false; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 092934032077..39b3ff97a504 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -679,6 +679,7 @@ static int icmpv6_rcv(struct sk_buff *skb) const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; u8 type; + bool success = false; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -726,7 +727,7 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - ping_rcv(skb); + success = ping_rcv(skb); break; case ICMPV6_PKT_TOOBIG: @@ -790,7 +791,14 @@ static int icmpv6_rcv(struct sk_buff *skb) icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); } - kfree_skb(skb); + /* until the v6 path can be better sorted assume failure and + * preserve the status quo behaviour for the rest of the paths to here + */ + if (success) + consume_skb(skb); + else + kfree_skb(skb); + return 0; csum_error: -- cgit v1.2.3 From 355a901e6cf1b2b763ec85caa2a9f04fbcc4ab4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Nov 2014 23:06:20 -0800 Subject: tcp: make connect() mem charging friendly While working on sk_forward_alloc problems reported by Denys Fedoryshchenko, we found that tcp connect() (and fastopen) do not call sk_wmem_schedule() for SYN packet (and/or SYN/DATA packet), so sk_forward_alloc is negative while connect is in progress. We can fix this by calling regular sk_stream_alloc_skb() both for the SYN packet (in tcp_connect()) and the syn_data packet in tcp_send_syn_data() Then, tcp_send_syn_data() can avoid copying syn_data as we simply can manipulate syn_data->cb[] to remove SYN flag (and increment seq) Instead of open coding memcpy_fromiovecend(), simply use this helper. This leaves in socket write queue clean fast clone skbs. This was tested against our fastopen packetdrill tests. Reported-by: Denys Fedoryshchenko Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 68 +++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 40 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index eb73a1dccf56..f5bd4bd3f7e6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3011,9 +3011,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; - struct sk_buff *syn_data = NULL, *data; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; + struct sk_buff *syn_data; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, @@ -3044,48 +3044,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, - sk->sk_allocation); - if (syn_data == NULL) + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + if (!syn_data) goto fallback; + syn_data->ip_summed = CHECKSUM_PARTIAL; + memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); + if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), + fo->data->msg_iov, 0, space))) { + kfree_skb(syn_data); + goto fallback; + } - for (i = 0; i < iovlen && syn_data->len < space; ++i) { - struct iovec *iov = &fo->data->msg_iov[i]; - unsigned char __user *from = iov->iov_base; - int len = iov->iov_len; + /* No more data pending in inet_wait_for_connect() */ + if (space == fo->size) + fo->data = NULL; + fo->copied = space; - if (syn_data->len + len > space) - len = space - syn_data->len; - else if (i + 1 == iovlen) - /* No more data pending in inet_wait_for_connect() */ - fo->data = NULL; + tcp_connect_queue_skb(sk, syn_data); - if (skb_add_data(syn_data, from, len)) - goto fallback; - } + err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - /* Queue a data-only packet after the regular SYN for retransmission */ - data = pskb_copy(syn_data, sk->sk_allocation); - if (data == NULL) - goto fallback; - TCP_SKB_CB(data)->seq++; - TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; - TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); - tcp_connect_queue_skb(sk, data); - fo->copied = data->len; - - /* syn_data is about to be sent, we need to take current time stamps - * for the packets that are in write queue : SYN packet and DATA - */ - skb_mstamp_get(&syn->skb_mstamp); - data->skb_mstamp = syn->skb_mstamp; + syn->skb_mstamp = syn_data->skb_mstamp; - if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + /* Now full SYN+DATA was cloned and sent (or not), + * remove the SYN from the original skb (syn_data) + * we keep in write queue in case of a retransmit, as we + * also have the SYN packet (with no data) in the same queue. 
+ */ + TCP_SKB_CB(syn_data)->seq++; + TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; + if (!err) { tp->syn_data = (fo->copied > 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - syn_data = NULL; fallback: /* Send a regular SYN with Fast Open cookie request option */ @@ -3094,7 +3086,6 @@ fallback: err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; - kfree_skb(syn_data); done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; @@ -3114,13 +3105,10 @@ int tcp_connect(struct sock *sk) return 0; } - buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); - if (unlikely(buff == NULL)) + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + if (unlikely(!buff)) return -ENOBUFS; - /* Reserve space for headers. */ - skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); -- cgit v1.2.3 From 62749e2cb3c4a7da3eaa5c01a7e787aebeff8536 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:04:58 +0100 Subject: vlan: rename __vlan_put_tag to vlan_insert_tag_set_proto Name fits better. Plus there's going to be introduced __vlan_insert_tag later on. Signed-off-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 ++-- drivers/net/ethernet/emulex/benet/be_main.c | 6 ++++-- drivers/net/vxlan.c | 12 ++++++------ include/linux/if_vlan.h | 8 +++++--- net/bridge/br_vlan.c | 4 ++-- net/core/dev.c | 4 ++-- net/core/netpoll.c | 4 ++-- net/ipv4/geneve.c | 11 +++++------ net/openvswitch/actions.c | 4 +++- net/openvswitch/datapath.c | 3 ++- net/openvswitch/vport-gre.c | 6 +++--- 11 files changed, 36 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e26c68232032..c1d7da427a3e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2146,8 +2146,8 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, netdev_dbg(slave_dev, "inner tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), tags->vlan_id); - skb = __vlan_put_tag(skb, tags->vlan_proto, - tags->vlan_id); + skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto, + tags->vlan_id); if (!skb) { net_err_ratelimited("failed to insert inner VLAN tag\n"); return; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 54160cc62656..d02fbc7694ea 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -887,7 +887,8 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, } if (vlan_tag) { - skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q), + vlan_tag); if (unlikely(!skb)) return skb; skb->vlan_tci = 0; @@ -896,7 +897,8 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, /* Insert the outer VLAN, if any */ if (adapter->qnq_vid) { vlan_tag = adapter->qnq_vid; - skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q), + vlan_tag); if (unlikely(!skb)) return skb; if (skip_hw_vlan) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 23b1e8c0d547..bb8fbab438e8 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1600,9 +1600,9 @@ static int vxlan6_xmit_skb(struct 
vxlan_sock *vs, return err; if (vlan_tx_tag_present(skb)) { - if (WARN_ON(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (WARN_ON(!skb)) return -ENOMEM; skb->vlan_tci = 0; @@ -1644,9 +1644,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, return err; if (vlan_tx_tag_present(skb)) { - if (WARN_ON(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (WARN_ON(!skb)) return -ENOMEM; skb->vlan_tci = 0; diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 75b70a5e4a6d..46e4a15b9b55 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -320,8 +320,9 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, } /** - * __vlan_put_tag - regular VLAN tag inserting + * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag + * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload @@ -330,8 +331,9 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. */ -static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, - __be16 vlan_proto, u16 vlan_tci) +static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, + __be16 vlan_proto, + u16 vlan_tci) { skb = vlan_insert_tag(skb, vlan_proto, vlan_tci); if (skb) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 150048fb99b0..97b8ddf57363 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -199,8 +199,8 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, if (skb->vlan_proto != proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) return false; diff --git a/net/core/dev.c b/net/core/dev.c index 1ab168e0fdf7..3611e60df407 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,8 +2645,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, { if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (skb) skb->vlan_tci = 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e6645b4f330a..65d372384a3f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -79,8 +79,8 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 31802afce34f..fd430a6a1c37 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -132,12 +132,11 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, return err; if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - 
vlan_tx_tag_get(skb)))) { - err = -ENOMEM; - return err; - } + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (unlikely(!skb) + return -ENOMEM; + skb->vlan_tci = 0; } diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 749a30163071..426b9131ede0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -287,7 +287,9 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, /* push down current VLAN tag */ current_tag = vlan_tx_tag_get(skb); - if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + current_tag); + if (!skb) return -ENOMEM; /* Update mac_len for subsequent MPLS actions */ skb->mac_len += VLAN_HLEN; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ab141d49bb9d..c63e60e4d947 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -425,7 +425,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!nskb) return -ENOMEM; - nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb)); + nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, + vlan_tx_tag_get(nskb)); if (!nskb) return -ENOMEM; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 8e61a5c6ae7c..777cd8c71d53 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -176,9 +176,9 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) } if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) { + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (unlikely(!skb) { err = -ENOMEM; goto err_free_rt; } -- cgit v1.2.3 From 5968250c868ceee680aa77395b24e6ddcae17d36 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:04:59 +0100 Subject: vlan: introduce *vlan_hwaccel_push_inside helpers Use them to push skb->vlan_tci into the payload and avoid code duplication. Signed-off-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 22 ++++++---------------- include/linux/if_vlan.h | 34 ++++++++++++++++++++++++++++++++++ net/core/dev.c | 8 ++------ net/core/netpoll.c | 4 +--- net/ipv4/geneve.c | 11 +++-------- net/openvswitch/datapath.c | 4 +--- net/openvswitch/vport-gre.c | 12 ++++-------- 7 files changed, 51 insertions(+), 44 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bb8fbab438e8..64d45fa3d997 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1599,14 +1599,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (WARN_ON(!skb)) - return -ENOMEM; - - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (WARN_ON(!skb)) + return -ENOMEM; vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); @@ -1643,14 +1638,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (WARN_ON(!skb)) - return -ENOMEM; - - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (WARN_ON(!skb)) + return -ENOMEM; vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 46e4a15b9b55..291e6706876e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -341,6 +341,40 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, return skb; } +/* + * __vlan_hwaccel_push_inside - pushes vlan tag to the payload + * @skb: skbuff to tag + * + * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. + * + * Following the skb_unshare() example, in case of error, the calling function + * doesn't have to worry about freeing the original skb. + */ +static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) +{ + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (likely(skb)) + skb->vlan_tci = 0; + return skb; +} +/* + * vlan_hwaccel_push_inside - pushes vlan tag to the payload + * @skb: skbuff to tag + * + * Checks is tag is present in @skb->vlan_tci and if it is, it pushes the + * VLAN tag from @skb->vlan_tci inside to the payload. + * + * Following the skb_unshare() example, in case of error, the calling function + * doesn't have to worry about freeing the original skb. 
+ */ +static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb) +{ + if (vlan_tx_tag_present(skb)) + skb = __vlan_hwaccel_push_inside(skb); + return skb; +} + /** * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting * @skb: skbuff to tag diff --git a/net/core/dev.c b/net/core/dev.c index 3611e60df407..ac4836241a96 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2644,12 +2644,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (skb) - skb->vlan_tci = 0; - } + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); return skb; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 65d372384a3f..e0ad5d16c9c5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -79,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this @@ -88,7 +87,6 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, */ goto out; } - skb->vlan_tci = 0; } status = netdev_start_xmit(skb, dev, txq, false); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index fd430a6a1c37..a457232f0131 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -131,14 +131,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb) - return -ENOMEM; - - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) + return -ENOMEM; gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c63e60e4d947..f37ca3e5824c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -425,12 +425,10 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!nskb) return -ENOMEM; - nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, - vlan_tx_tag_get(nskb)); + nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; - nskb->vlan_tci = 0; skb = nskb; } diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 777cd8c71d53..6b69df545b1d 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -175,14 +175,10 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_rt; } - if (vlan_tx_tag_present(skb)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb) { - err = -ENOMEM; - goto err_free_rt; - } - skb->vlan_tci = 0; + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) { + err = -ENOMEM; + goto err_free_rt; } /* Push Tunnel header. 
*/ -- cgit v1.2.3 From 227158db160449b6513d2e31894a135104b90e90 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Apr 2014 18:47:38 -0400 Subject: new helper: skb_copy_and_csum_datagram_msg() Signed-off-by: Al Viro --- include/linux/skbuff.h | 5 +++++ net/ipv4/udp.c | 5 ++--- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 78c299f40bac..cbe4b2078b30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2651,6 +2651,11 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, } int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); +static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, + struct msghdr *msg) +{ + return skb_copy_and_csum_datagram_iovec(skb, hlen, msg->msg_iov); +} int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, const struct iovec *from, int from_offset, int len); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a16b9129079..b2d606833ce4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1284,9 +1284,8 @@ try_again: err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { - err = skb_copy_and_csum_datagram_iovec(skb, - sizeof(struct udphdr), - msg->msg_iov); + err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), + msg); if (err == -EINVAL) goto csum_copy_err; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0cbcf98f2cab..8baa53e17a30 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -492,7 +492,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, goto csum_copy_err; err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { - err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); + err = skb_copy_and_csum_datagram_msg(skb, 0, msg); if (err == -EINVAL) goto csum_copy_err; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index dbc0b042bdd6..7cfb5d745a2d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -428,7 +428,7 @@ try_again: err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { - err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg); if (err == -EINVAL) goto csum_copy_err; } -- cgit v1.2.3 From 6ce8e9ce5989ae13f493062975304700be86d20e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Apr 2014 21:25:44 -0400 Subject: new helper: memcpy_from_msg() Signed-off-by: Al Viro --- crypto/algif_skcipher.c | 10 +++++----- drivers/isdn/mISDN/socket.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- include/linux/skbuff.h | 5 +++++ include/net/sctp/sm.h | 2 +- net/appletalk/ddp.c | 2 +- net/ax25/af_ax25.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/mgmt.c | 2 +- net/bluetooth/rfcomm/sock.c | 2 +- net/bluetooth/sco.c | 2 +- net/caif/caif_socket.c | 4 ++-- net/can/bcm.c | 19 ++++++++----------- net/can/raw.c | 2 +- net/dccp/proto.c | 2 +- net/decnet/af_decnet.c | 2 +- net/ieee802154/dgram.c | 2 +- net/ieee802154/raw.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/irda/af_irda.c | 6 +++--- net/iucv/af_iucv.c | 2 +- net/key/af_key.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ppp.c | 3 +-- net/llc/af_llc.c | 2 +- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 2 +- net/nfc/llcp_commands.c | 4 ++-- net/nfc/rawsock.c | 2 +- net/packet/af_packet.c | 5 ++--- net/phonet/datagram.c | 2 +- net/phonet/pep.c | 2 +- net/rose/af_rose.c | 2 
+- net/sctp/sm_make_chunk.c | 4 ++-- net/x25/af_x25.c | 2 +- 36 files changed, 57 insertions(+), 57 deletions(-) (limited to 'net/ipv4') diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 83187f497c7c..c3b482bee208 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -298,9 +298,9 @@ static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock, len = min_t(unsigned long, len, PAGE_SIZE - sg->offset - sg->length); - err = memcpy_fromiovec(page_address(sg_page(sg)) + - sg->offset + sg->length, - msg->msg_iov, len); + err = memcpy_from_msg(page_address(sg_page(sg)) + + sg->offset + sg->length, + msg, len); if (err) goto unlock; @@ -337,8 +337,8 @@ static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock, if (!sg_page(sg + i)) goto unlock; - err = memcpy_fromiovec(page_address(sg_page(sg + i)), - msg->msg_iov, plen); + err = memcpy_from_msg(page_address(sg_page(sg + i)), + msg, plen); if (err) { __free_page(sg_page(sg + i)); sg_assign_page(sg + i, NULL); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index dcbd8589f0c4..84b35925ee4d 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -203,7 +203,7 @@ mISDN_sock_sendmsg(struct kiocb *iocb, struct socket *sock, if (!skb) goto done; - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto done; } diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 443cbbf5c55f..d2408a5e43a6 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -869,7 +869,7 @@ static int pppoe_sendmsg(struct kiocb *iocb, struct socket *sock, ph = (struct pppoe_hdr *)skb_put(skb, total_len + sizeof(struct pppoe_hdr)); start = (char *)&ph->tag[0]; - error = memcpy_fromiovec(start, m->msg_iov, total_len); + error = memcpy_from_msg(start, m, total_len); if (error < 0) { kfree_skb(skb); goto end; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cbe4b2078b30..97dc5f8123b3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2687,6 +2687,11 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); +static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) +{ + return memcpy_fromiovec(data, msg->msg_iov, len); +} + struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 72a31db47ded..487ef34bbd63 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -219,7 +219,7 @@ struct sctp_chunk *sctp_make_abort_no_data(const struct sctp_association *, const struct sctp_chunk *, __u32 tsn); struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *, - const struct msghdr *, size_t msg_len); + struct msghdr *, size_t msg_len); struct sctp_chunk *sctp_make_abort_violation(const struct sctp_association *, const struct sctp_chunk *, const __u8 *, diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 425942db17f6..0d0766ea5ab1 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1659,7 +1659,7 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len); - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + err = 
memcpy_from_msg(skb_put(skb, len), msg, len); if (err) { kfree_skb(skb); err = -EFAULT; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index f4f835e19378..ca049a7c9287 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1549,7 +1549,7 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, skb_reserve(skb, size - len); /* User data follows immediately after the AX.25 data */ - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; kfree_skb(skb); goto out; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 5e2cd2535978..2c245fdf319a 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -947,7 +947,7 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, if (!skb) goto done; - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto drop; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index cbeef5f62f3b..f3e4a16fb157 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5767,7 +5767,7 @@ int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen) if (!buf) return -ENOMEM; - if (memcpy_fromiovec(buf, msg->msg_iov, msglen)) { + if (memcpy_from_msg(buf, msg, msglen)) { err = -EFAULT; goto done; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 8bbbb5ec468c..2348176401a0 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -588,7 +588,7 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, } skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err) { kfree_skb(skb); if (sent == 0) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 7ee9e4ab00f8..30e5ea3f1ad3 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -285,7 +285,7 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) if (!skb) return err; - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); return -EFAULT; } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index fbcd156099fb..230f14026c11 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -566,7 +566,7 @@ static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock, skb_reserve(skb, cf_sk->headroom); - ret = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + ret = memcpy_from_msg(skb_put(skb, len), msg, len); if (ret) goto err; @@ -641,7 +641,7 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, */ size = min_t(int, size, skb_tailroom(skb)); - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err) { kfree_skb(skb); goto out_err; diff --git a/net/can/bcm.c b/net/can/bcm.c index dcb75c0e66c1..b9a1f715df18 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -858,8 +858,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* update can_frames content */ for (i = 0; i < msg_head->nframes; i++) { - err = memcpy_fromiovec((u8 *)&op->frames[i], - msg->msg_iov, CFSIZ); + err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ); if (op->frames[i].can_dlc > 8) err = -EINVAL; @@ -894,8 +893,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct 
msghdr *msg, op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { - err = memcpy_fromiovec((u8 *)&op->frames[i], - msg->msg_iov, CFSIZ); + err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ); if (op->frames[i].can_dlc > 8) err = -EINVAL; @@ -1024,9 +1022,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->nframes) { /* update can_frames content */ - err = memcpy_fromiovec((u8 *)op->frames, - msg->msg_iov, - msg_head->nframes * CFSIZ); + err = memcpy_from_msg((u8 *)op->frames, msg, + msg_head->nframes * CFSIZ); if (err < 0) return err; @@ -1072,8 +1069,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } if (msg_head->nframes) { - err = memcpy_fromiovec((u8 *)op->frames, msg->msg_iov, - msg_head->nframes * CFSIZ); + err = memcpy_from_msg((u8 *)op->frames, msg, + msg_head->nframes * CFSIZ); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); @@ -1209,7 +1206,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) can_skb_reserve(skb); - err = memcpy_fromiovec(skb_put(skb, CFSIZ), msg->msg_iov, CFSIZ); + err = memcpy_from_msg(skb_put(skb, CFSIZ), msg, CFSIZ); if (err < 0) { kfree_skb(skb); return err; @@ -1285,7 +1282,7 @@ static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock, /* read message head information */ - ret = memcpy_fromiovec((u8 *)&msg_head, msg->msg_iov, MHSIZ); + ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); if (ret < 0) return ret; diff --git a/net/can/raw.c b/net/can/raw.c index 081e81fd017f..0e4004fb6876 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -703,7 +703,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto free_skb; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8e6ae9422a7b..19f038739087 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -781,7 +781,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto out_release; skb_reserve(skb, sk->sk_prot->max_header); - rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) goto out_discard; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 25733d538147..e2e2e3cb9113 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2032,7 +2032,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER); - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto out; } diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index b8555ec71387..2c7a93e7167e 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -276,7 +276,7 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, if (err < 0) goto out_skb; - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 21c38945ab8b..61e9d2972947 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -150,7 +150,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, skb_reset_mac_header(skb); skb_reset_network_header(skb); - err = memcpy_fromiovec(skb_put(skb, size), 
msg->msg_iov, size); + err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index ce2920f5bef3..ef8f6ee90473 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -660,7 +660,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, * Fetch the ICMP header provided by the userland. * iovec is modified! The ICMP header is consumed. */ - if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len)) + if (memcpy_from_msg(user_icmph, msg, icmph_len)) return -EFAULT; if (family == AF_INET) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d22a31f27ab4..69de1a1c05c9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4368,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) + if (memcpy_from_msg(skb_put(skb, size), msg, size)) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index e8c409055922..9052462cf42a 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1319,7 +1319,7 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, skb_reserve(skb, self->max_header_size + 16); skb_reset_transport_header(skb); skb_put(skb, len); - err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + err = memcpy_from_msg(skb_transport_header(skb), msg, len); if (err) { kfree_skb(skb); goto out_err; @@ -1569,7 +1569,7 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, pr_debug("%s(), appending user data\n", __func__); skb_put(skb, len); - err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + err = memcpy_from_msg(skb_transport_header(skb), msg, len); if (err) { kfree_skb(skb); goto out; @@ -1678,7 +1678,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, pr_debug("%s(), appending user data\n", __func__); skb_put(skb, len); - err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + err = memcpy_from_msg(skb_transport_header(skb), msg, len); if (err) { kfree_skb(skb); goto out; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 057b5647ef92..1cd3f8107239 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1122,7 +1122,7 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, } if (iucv->transport == AF_IUCV_TRANS_HIPER) skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN); - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto fail; } diff --git a/net/key/af_key.c b/net/key/af_key.c index e5883091a8c6..f8ac939d52b4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3611,7 +3611,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb, goto out; err = -EFAULT; - if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) + if (memcpy_from_msg(skb_put(skb,len), msg, len)) goto out; hdr = pfkey_get_base_msg(skb, &err); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index a6cc1fed2b52..05dfc8aa36af 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -441,7 +441,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m *((__be32 *) skb_put(skb, 4)) = 0; /* Copy user data into skb */ - rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { 
kfree_skb(skb); goto error; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index c559bcdf4679..cc7a828fc914 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -346,8 +346,7 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh skb_put(skb, 2); /* Copy user data into skb */ - error = memcpy_fromiovec(skb_put(skb, total_len), m->msg_iov, - total_len); + error = memcpy_from_msg(skb_put(skb, total_len), m, total_len); if (error < 0) { kfree_skb(skb); goto error_put_sess_tun; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index af662669f951..2c0b83ce43bd 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -921,7 +921,7 @@ static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = llc->dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hdrlen); - rc = memcpy_fromiovec(skb_put(skb, copied), msg->msg_iov, copied); + rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e1aad6eeac14..63aa5c8acf12 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2325,7 +2325,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7e13f6afcd1f..69f1d5e9959f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1113,7 +1113,7 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, skb_put(skb, len); /* User data follows immediately after the NET/ROM transport header */ - if (memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len)) { + if (memcpy_from_msg(skb_transport_header(skb), msg, len)) { kfree_skb(skb); err = -EFAULT; goto out; diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index a3ad69a4c648..c4da0c2d8a14 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -665,7 +665,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, if (msg_data == NULL) return -ENOMEM; - if (memcpy_fromiovec(msg_data, msg->msg_iov, len)) { + if (memcpy_from_msg(msg_data, msg, len)) { kfree(msg_data); return -EFAULT; } @@ -731,7 +731,7 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, if (msg_data == NULL) return -ENOMEM; - if (memcpy_fromiovec(msg_data, msg->msg_iov, len)) { + if (memcpy_from_msg(msg_data, msg, len)) { kfree(msg_data); return -EFAULT; } diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 9d7d2b7ba5e4..373e138c0ab6 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -231,7 +231,7 @@ static int rawsock_sendmsg(struct kiocb *iocb, struct socket *sock, if (skb == NULL) return rc; - rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { kfree_skb(skb); return rc; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 58af58026dd2..07ac95014ecb 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1676,7 +1676,7 @@ retry: if (len < hhlen) skb_reset_network_header(skb); } - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; @@ -2446,8 +2446,7 @@ static int packet_snd(struct socket 
*sock, struct msghdr *msg, size_t len) len -= vnet_hdr_len; - err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov, - vnet_hdr_len); + err = memcpy_from_msg((void *)&vnet_hdr, msg, vnet_hdr_len); if (err < 0) goto out_unlock; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 0918bc21eae6..26054b4b467c 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -109,7 +109,7 @@ static int pn_sendmsg(struct kiocb *iocb, struct sock *sk, return err; skb_reserve(skb, MAX_PHONET_HEADER); - err = memcpy_fromiovec((void *)skb_put(skb, len), msg->msg_iov, len); + err = memcpy_from_msg((void *)skb_put(skb, len), msg, len); if (err < 0) { kfree_skb(skb); return err; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 9cd069dfaf65..5d3f2b7507d4 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1141,7 +1141,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, return err; skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned); - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err < 0) goto outfree; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 9b600c20a7a3..43bac7c4dd9e 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1121,7 +1121,7 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, skb_reset_transport_header(skb); skb_put(skb, len); - err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + err = memcpy_from_msg(skb_transport_header(skb), msg, len); if (err) { kfree_skb(skb); return err; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9f32741abb1c..e49bccebb0cc 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1001,7 +1001,7 @@ no_mem: /* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. 
*/ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, - const struct msghdr *msg, + struct msghdr *msg, size_t paylen) { struct sctp_chunk *retval; @@ -1018,7 +1018,7 @@ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, if (!payload) goto err_payload; - err = memcpy_fromiovec(payload, msg->msg_iov, paylen); + err = memcpy_from_msg(payload, msg, paylen); if (err < 0) goto err_copy; } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 59e785bfde65..d9149b68b9bc 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1170,7 +1170,7 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, skb_reset_transport_header(skb); skb_put(skb, len); - rc = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; -- cgit v1.2.3 From 7eab8d9e8a722ca07bc785f73e21c3d3418defa6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Apr 2014 21:51:23 -0400 Subject: new helper: memcpy_to_msg() Signed-off-by: Al Viro --- crypto/algif_hash.c | 2 +- include/linux/skbuff.h | 5 +++++ net/caif/caif_socket.c | 2 +- net/can/bcm.c | 2 +- net/can/raw.c | 2 +- net/decnet/af_decnet.c | 2 +- net/ipv4/tcp.c | 2 +- net/irda/af_irda.c | 2 +- net/packet/af_packet.c | 3 +-- 9 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 850246206b12..35c93ff11f35 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -174,7 +174,7 @@ static int hash_recvmsg(struct kiocb *unused, struct socket *sock, goto unlock; } - err = memcpy_toiovec(msg->msg_iov, ctx->result, len); + err = memcpy_to_msg(msg, ctx->result, len); unlock: release_sock(sk); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 97dc5f8123b3..d048347a010a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2692,6 +2692,11 @@ static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) return memcpy_fromiovec(data, msg->msg_iov, len); } +static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) +{ + return memcpy_toiovec(msg->msg_iov, data, len); +} + struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 230f14026c11..ac618b0b8a4f 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -418,7 +418,7 @@ unlock: } release_sock(sk); chunk = min_t(unsigned int, skb->len, size); - if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + if (memcpy_to_msg(msg, skb->data, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (copied == 0) copied = -EFAULT; diff --git a/net/can/bcm.c b/net/can/bcm.c index b9a1f715df18..01671187e3fe 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1555,7 +1555,7 @@ static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb->len < size) size = skb->len; - err = memcpy_toiovec(msg->msg_iov, skb->data, size); + err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; diff --git a/net/can/raw.c b/net/can/raw.c index 0e4004fb6876..dfdcffbb1070 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -750,7 +750,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, else size = skb->len; - err = memcpy_toiovec(msg->msg_iov, skb->data, size); + err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { 
skb_free_datagram(sk, skb); return err; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index e2e2e3cb9113..810228646de3 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1760,7 +1760,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, if ((chunk + copied) > size) chunk = size - copied; - if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + if (memcpy_to_msg(msg, skb->data, chunk)) { rv = -EFAULT; break; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c239f4740d10..435443bfc3c3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1349,7 +1349,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) if (len > 0) { if (!(flags & MSG_TRUNC)) - err = memcpy_toiovec(msg->msg_iov, &c, 1); + err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 9052462cf42a..568edc72d737 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1466,7 +1466,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, } chunk = min_t(unsigned int, skb->len, size); - if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + if (memcpy_to_msg(msg, skb->data, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (copied == 0) copied = -EFAULT; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 07ac95014ecb..108d7f381b87 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2949,8 +2949,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ - err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr, - vnet_hdr_len); + err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len); if (err < 0) goto out_free; } -- cgit v1.2.3 From 4fd671ded14f92cb8db0bf72747f4df508ba5e3d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 25 Nov 2014 11:21:20 -0800 Subject: gue: Call remcsum_adjust Change remote checksum offload to call remcsum_adjust. This also eliminates the optimization to skip an IP header as part of the adjustment (really does not seem to be much of a win). Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/ipv4/fou.c | 84 ++++++++++++---------------------------------------------- 1 file changed, 17 insertions(+), 67 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 3dfe9828e7ef..b986298a7ba3 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -64,15 +64,13 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) } static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, - void *data, int hdrlen, u8 ipproto) + void *data, size_t hdrlen, u8 ipproto) { __be16 *pd = data; - u16 start = ntohs(pd[0]); - u16 offset = ntohs(pd[1]); - u16 poffset = 0; - u16 plen; - __wsum csum, delta; - __sum16 *psum; + size_t start = ntohs(pd[0]); + size_t offset = ntohs(pd[1]); + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + __wsum delta; if (skb->remcsum_offload) { /* Already processed in GRO path */ @@ -80,35 +78,15 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, return guehdr; } - if (start > skb->len - hdrlen || - offset > skb->len - hdrlen - sizeof(u16)) - return NULL; - - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) - __skb_checksum_complete(skb); - - plen = hdrlen + offset + sizeof(u16); if (!pskb_may_pull(skb, plen)) return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - if (ipproto == IPPROTO_IP && sizeof(struct iphdr) < plen) { - struct iphdr *ip = (struct iphdr *)(skb->data + hdrlen); - - /* If next header happens to be IP we can skip that for the - * checksum calculation since the IP header checksum is zero - * if correct. - */ - poffset = ip->ihl * 4; - } - - csum = csum_sub(skb->csum, skb_checksum(skb, poffset + hdrlen, - start - poffset - hdrlen, 0)); + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) + __skb_checksum_complete(skb); - /* Set derived checksum in packet */ - psum = (__sum16 *)(skb->data + hdrlen + offset); - delta = csum_sub(csum_fold(csum), *psum); - *psum = csum_fold(csum); + delta = remcsum_adjust((void *)guehdr + hdrlen, + skb->csum, start, offset); /* Adjust skb->csum since we changed the packet */ skb->csum = csum_add(skb->csum, delta); @@ -158,9 +136,6 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); - /* Pull UDP header now, skb->data points to guehdr */ - __skb_pull(skb, sizeof(struct udphdr)); - /* Pull csum through the guehdr now . This can be used if * there is a remote checksum offload. 
*/ @@ -188,7 +163,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); - __skb_pull(skb, hdrlen); + __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); return -guehdr->proto_ctype; @@ -248,24 +223,17 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, size_t hdrlen, u8 ipproto) { __be16 *pd = data; - u16 start = ntohs(pd[0]); - u16 offset = ntohs(pd[1]); - u16 poffset = 0; - u16 plen; - void *ptr; - __wsum csum, delta; - __sum16 *psum; + size_t start = ntohs(pd[0]); + size_t offset = ntohs(pd[1]); + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + __wsum delta; if (skb->remcsum_offload) return guehdr; - if (start > skb_gro_len(skb) - hdrlen || - offset > skb_gro_len(skb) - hdrlen - sizeof(u16) || - !NAPI_GRO_CB(skb)->csum_valid || skb->remcsum_offload) + if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; - plen = hdrlen + offset + sizeof(u16); - /* Pull checksum that will be written */ if (skb_gro_header_hard(skb, off + plen)) { guehdr = skb_gro_header_slow(skb, off + plen, off); @@ -273,26 +241,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return NULL; } - ptr = (void *)guehdr + hdrlen; - - if (ipproto == IPPROTO_IP && - (hdrlen + sizeof(struct iphdr) < plen)) { - struct iphdr *ip = (struct iphdr *)(ptr + hdrlen); - - /* If next header happens to be IP we can skip - * that for the checksum calculation since the - * IP header checksum is zero if correct. - */ - poffset = ip->ihl * 4; - } - - csum = csum_sub(NAPI_GRO_CB(skb)->csum, - csum_partial(ptr + poffset, start - poffset, 0)); - - /* Set derived checksum in packet */ - psum = (__sum16 *)(ptr + offset); - delta = csum_sub(csum_fold(csum), *psum); - *psum = csum_fold(csum); + delta = remcsum_adjust((void *)guehdr + hdrlen, + NAPI_GRO_CB(skb)->csum, start, offset); /* Adjust skb->csum since we changed the packet */ skb->csum = csum_add(skb->csum, delta); -- cgit v1.2.3 From b59eaf9e2871735ea7cc7e3dbf8bf83bddd786b9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 26 Nov 2014 12:46:50 +0100 Subject: netfilter: combine IPv4 and IPv6 nf_nat_redirect code in one module This resolves linking problems with CONFIG_IPV6=n: net/built-in.o: In function `redirect_tg6': xt_REDIRECT.c:(.text+0x6d021): undefined reference to `nf_nat_redirect_ipv6' Reported-by: Andreas Ruprecht Reported-by: Or Gerlitz Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_nat_redirect.h | 9 -- include/net/netfilter/ipv6/nf_nat_redirect.h | 8 -- include/net/netfilter/nf_nat_redirect.h | 12 +++ net/ipv4/netfilter/Kconfig | 8 +- net/ipv4/netfilter/Makefile | 1 - net/ipv4/netfilter/nf_nat_redirect_ipv4.c | 82 ----------------- net/ipv4/netfilter/nft_redir_ipv4.c | 2 +- net/ipv6/netfilter/Kconfig | 8 +- net/ipv6/netfilter/Makefile | 1 - net/ipv6/netfilter/nf_nat_redirect_ipv6.c | 75 ---------------- net/ipv6/netfilter/nft_redir_ipv6.c | 2 +- net/netfilter/Kconfig | 10 ++- net/netfilter/Makefile | 1 + net/netfilter/nf_nat_redirect.c | 127 +++++++++++++++++++++++++++ net/netfilter/xt_REDIRECT.c | 3 +- 15 files changed, 153 insertions(+), 196 deletions(-) delete mode 100644 include/net/netfilter/ipv4/nf_nat_redirect.h delete mode 100644 include/net/netfilter/ipv6/nf_nat_redirect.h create mode 100644 include/net/netfilter/nf_nat_redirect.h delete mode 100644 net/ipv4/netfilter/nf_nat_redirect_ipv4.c delete mode 100644 net/ipv6/netfilter/nf_nat_redirect_ipv6.c 
create mode 100644 net/netfilter/nf_nat_redirect.c (limited to 'net/ipv4') diff --git a/include/net/netfilter/ipv4/nf_nat_redirect.h b/include/net/netfilter/ipv4/nf_nat_redirect.h deleted file mode 100644 index 19e1df3a0a4d..000000000000 --- a/include/net/netfilter/ipv4/nf_nat_redirect.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _NF_NAT_REDIRECT_IPV4_H_ -#define _NF_NAT_REDIRECT_IPV4_H_ - -unsigned int -nf_nat_redirect_ipv4(struct sk_buff *skb, - const struct nf_nat_ipv4_multi_range_compat *mr, - unsigned int hooknum); - -#endif /* _NF_NAT_REDIRECT_IPV4_H_ */ diff --git a/include/net/netfilter/ipv6/nf_nat_redirect.h b/include/net/netfilter/ipv6/nf_nat_redirect.h deleted file mode 100644 index 1ebdffc461cc..000000000000 --- a/include/net/netfilter/ipv6/nf_nat_redirect.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _NF_NAT_REDIRECT_IPV6_H_ -#define _NF_NAT_REDIRECT_IPV6_H_ - -unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, - unsigned int hooknum); - -#endif /* _NF_NAT_REDIRECT_IPV6_H_ */ diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h new file mode 100644 index 000000000000..73b729543309 --- /dev/null +++ b/include/net/netfilter/nf_nat_redirect.h @@ -0,0 +1,12 @@ +#ifndef _NF_NAT_REDIRECT_H_ +#define _NF_NAT_REDIRECT_H_ + +unsigned int +nf_nat_redirect_ipv4(struct sk_buff *skb, + const struct nf_nat_ipv4_multi_range_compat *mr, + unsigned int hooknum); +unsigned int +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, + unsigned int hooknum); + +#endif /* _NF_NAT_REDIRECT_H_ */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 8358b2da1549..59f883d9cadf 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -104,12 +104,6 @@ config NF_NAT_MASQUERADE_IPV4 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection). -config NF_NAT_REDIRECT_IPV4 - tristate "IPv4 redirect support" - help - This is the kernel functionality to provide NAT in the redirect - flavour (redirect packets to local machine). - config NFT_MASQ_IPV4 tristate "IPv4 masquerading support for nf_tables" depends on NF_TABLES_IPV4 @@ -123,7 +117,7 @@ config NFT_REDIR_IPV4 tristate "IPv4 redirect support for nf_tables" depends on NF_TABLES_IPV4 depends on NFT_REDIR - select NF_NAT_REDIRECT_IPV4 + select NF_NAT_REDIRECT help This is the expression that provides IPv4 redirect support for nf_tables. 
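As background for why both helpers need to live in one family-independent module: a dual-family caller such as xt_REDIRECT references nf_nat_redirect_ipv4() and nf_nat_redirect_ipv6() from the same object, so with CONFIG_IPV6=n the IPv6-only module is never built and the final link fails as quoted above. Below is a minimal sketch of such a caller, not code from this patch; the field names (par->targinfo, par->hooknum) are assumptions based on the xtables API of this period.

#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_nat_redirect.h>

/* Hypothetical dual-family target: both handlers end up in the same
 * xt_REDIRECT.ko, so both nf_nat_redirect_*() symbols must be exported
 * by a module that does not itself depend on CONFIG_IPV6 -- which is
 * what the consolidated net/netfilter/nf_nat_redirect.c provides.
 */
static unsigned int
redirect_tg4_sketch(struct sk_buff *skb, const struct xt_action_param *par)
{
	/* targinfo is assumed to carry the legacy IPv4 compat range */
	return nf_nat_redirect_ipv4(skb, par->targinfo, par->hooknum);
}

static unsigned int
redirect_tg6_sketch(struct sk_buff *skb, const struct xt_action_param *par)
{
	/* targinfo is assumed to carry a struct nf_nat_range */
	return nf_nat_redirect_ipv6(skb, par->targinfo, par->hooknum);
}

With both functions compiled into the shared module, selecting NF_NAT_REDIRECT is sufficient for either address family, which is why the per-family Kconfig symbols are dropped in this patch.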
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 902bcd1597bb..7fe6c703528f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o -obj-$(CONFIG_NF_NAT_REDIRECT_IPV4) += nf_nat_redirect_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o diff --git a/net/ipv4/netfilter/nf_nat_redirect_ipv4.c b/net/ipv4/netfilter/nf_nat_redirect_ipv4.c deleted file mode 100644 index a220552fc532..000000000000 --- a/net/ipv4/netfilter/nf_nat_redirect_ipv4.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team - * Copyright (c) 2011 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 - * NAT funded by Astaro. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -unsigned int -nf_nat_redirect_ipv4(struct sk_buff *skb, - const struct nf_nat_ipv4_multi_range_compat *mr, - unsigned int hooknum) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - __be32 newdst; - struct nf_nat_range newrange; - - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || - hooknum == NF_INET_LOCAL_OUT); - - ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - - /* Local packets: make them go to loopback */ - if (hooknum == NF_INET_LOCAL_OUT) { - newdst = htonl(0x7F000001); - } else { - struct in_device *indev; - struct in_ifaddr *ifa; - - newdst = 0; - - rcu_read_lock(); - indev = __in_dev_get_rcu(skb->dev); - if (indev != NULL) { - ifa = indev->ifa_list; - newdst = ifa->ifa_local; - } - rcu_read_unlock(); - - if (!newdst) - return NF_DROP; - } - - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newdst; - newrange.max_addr.ip = newdst; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); -} -EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 643c5967aa27..ff2d23d8c87a 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include static void nft_redir_ipv4_eval(const struct nft_expr *expr, diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 0dbe5c7953e5..a069822936e6 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -82,12 +82,6 @@ config NF_NAT_MASQUERADE_IPV6 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection) for IPv6. 
-config NF_NAT_REDIRECT_IPV6 - tristate "IPv6 redirect support" - help - This is the kernel functionality to provide NAT in the redirect - flavour (redirect packet to local machine) for IPv6. - config NFT_MASQ_IPV6 tristate "IPv6 masquerade support for nf_tables" depends on NF_TABLES_IPV6 @@ -101,7 +95,7 @@ config NFT_REDIR_IPV6 tristate "IPv6 redirect support for nf_tables" depends on NF_TABLES_IPV6 depends on NFT_REDIR - select NF_NAT_REDIRECT_IPV6 + select NF_NAT_REDIRECT help This is the expression that provides IPv4 redirect support for nf_tables. diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index d2ac9f5f212c..c36e0a5490de 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o -obj-$(CONFIG_NF_NAT_REDIRECT_IPV6) += nf_nat_redirect_ipv6.o # defrag nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o diff --git a/net/ipv6/netfilter/nf_nat_redirect_ipv6.c b/net/ipv6/netfilter/nf_nat_redirect_ipv6.c deleted file mode 100644 index ea1308aeb048..000000000000 --- a/net/ipv6/netfilter/nf_nat_redirect_ipv6.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team - * Copyright (c) 2011 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 - * NAT funded by Astaro. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; - -unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, - unsigned int hooknum) -{ - struct nf_nat_range newrange; - struct in6_addr newdst; - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (hooknum == NF_INET_LOCAL_OUT) { - newdst = loopback_addr; - } else { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; - bool addr = false; - - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - if (idev != NULL) { - list_for_each_entry(ifa, &idev->addr_list, if_list) { - newdst = ifa->addr; - addr = true; - break; - } - } - rcu_read_unlock(); - - if (!addr) - return NF_DROP; - } - - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.in6 = newdst; - newrange.max_addr.in6 = newdst; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); -} -EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index 83420eeaad1c..2433a6bfb191 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include static void nft_redir_ipv6_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 57f15a9aa481..b02660fa9eb0 100644 --- a/net/netfilter/Kconfig +++ 
b/net/netfilter/Kconfig @@ -411,6 +411,13 @@ config NF_NAT_TFTP depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP +config NF_NAT_REDIRECT + tristate "IPv4/IPv6 redirect support" + depends on NF_NAT + help + This is the kernel functionality to redirect packets to local + machine through NAT. + config NETFILTER_SYNPROXY tristate @@ -844,8 +851,7 @@ config NETFILTER_XT_TARGET_RATEEST config NETFILTER_XT_TARGET_REDIRECT tristate "REDIRECT target support" depends on NF_NAT - select NF_NAT_REDIRECT_IPV4 if NF_NAT_IPV4 - select NF_NAT_REDIRECT_IPV6 if NF_NAT_IPV6 + select NF_NAT_REDIRECT ---help--- REDIRECT is a special case of NAT: all incoming connections are mapped onto the incoming interface's address, causing the packets to diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f3eb4680f2ec..89f73a9e9874 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -51,6 +51,7 @@ nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o obj-$(CONFIG_NF_NAT) += nf_nat.o +obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c new file mode 100644 index 000000000000..97b75f9bfbcd --- /dev/null +++ b/net/netfilter/nf_nat_redirect.c @@ -0,0 +1,127 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * Copyright (c) 2011 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 + * NAT funded by Astaro. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int +nf_nat_redirect_ipv4(struct sk_buff *skb, + const struct nf_nat_ipv4_multi_range_compat *mr, + unsigned int hooknum) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 newdst; + struct nf_nat_range newrange; + + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_OUT); + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (hooknum == NF_INET_LOCAL_OUT) { + newdst = htonl(0x7F000001); + } else { + struct in_device *indev; + struct in_ifaddr *ifa; + + newdst = 0; + + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + if (indev != NULL) { + ifa = indev->ifa_list; + newdst = ifa->ifa_local; + } + rcu_read_unlock(); + + if (!newdst) + return NF_DROP; + } + + /* Transfer from original range. */ + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newdst; + newrange.max_addr.ip = newdst; + newrange.min_proto = mr->range[0].min; + newrange.max_proto = mr->range[0].max; + + /* Hand modified range to generic setup. 
*/ + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} +EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); + +static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; + +unsigned int +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, + unsigned int hooknum) +{ + struct nf_nat_range newrange; + struct in6_addr newdst; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (hooknum == NF_INET_LOCAL_OUT) { + newdst = loopback_addr; + } else { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + bool addr = false; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + if (idev != NULL) { + list_for_each_entry(ifa, &idev->addr_list, if_list) { + newdst = ifa->addr; + addr = true; + break; + } + } + rcu_read_unlock(); + + if (!addr) + return NF_DROP; + } + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = newdst; + newrange.max_addr.in6 = newdst; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); +} +EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index b6ec67efd900..03f0b370e178 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -26,8 +26,7 @@ #include #include #include -#include -#include +#include static unsigned int redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) -- cgit v1.2.3 From 6fb2a756739aa507c1fd5b8126f0bfc2f070dc46 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sat, 29 Nov 2014 09:59:45 -0800 Subject: gre: Set inner mac header in gro complete Set the inner mac header to point to the GRE payload when doing GRO. This is needed if we proceed to send the packet through GRE GSO which now uses the inner mac header instead of inner network header to determine the length of encapsulation headers. Fixes: 14051f0452a2 ("gre: Use inner mac length when computing tunnel length") Reported-by: Wolfgang Walter Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index bb5947b0ce2d..51973ddc05a6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -247,6 +247,9 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); rcu_read_unlock(); + + skb_set_inner_mac_header(skb, nhoff + grehlen); + return err; } -- cgit v1.2.3 From 7ce875e5ecb8562fd44040f69bda96c999e38bbc Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Nov 2014 22:22:33 -0500 Subject: ipv4: warn once on passing AF_INET6 socket to ip_recv_error One line change, in response to catching an occurrence of this bug. See also fix f4713a3dfad0 ("net-timestamp: make tcp_recvmsg call ...") Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/ipv4/ip_sockglue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b7826575d215..59eba6c7a512 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -414,6 +414,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int err; int copied; + WARN_ON_ONCE(sk->sk_family == AF_INET6); + err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) -- cgit v1.2.3 From 829ae9d611651467fe6cd7be834bd33ca6b28dfe Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Nov 2014 22:22:34 -0500 Subject: net-timestamp: allow reading recv cmsg on errqueue with origin tstamp Allow reading of timestamps and cmsg at the same time on all relevant socket families. One use is to correlate timestamps with egress device, by asking for cmsg IP_PKTINFO. on AF_INET sockets, call the relevant function (ip_cmsg_recv). To avoid changing legacy expectations, only do so if the caller sets a new timestamping flag SOF_TIMESTAMPING_OPT_CMSG. on AF_INET6 sockets, IPV6_PKTINFO and all other recv cmsg are already returned for all origins. only change is to set ifindex, which is not initialized for all error origins. In both cases, only generate the pktinfo message if an ifindex is known. This is not the case for ACK timestamps. The difference between the protocol families is probably a historical accident as a result of the different conditions for generating cmsg in the relevant ip(v6)_recv_error function: ipv4: if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { ipv6: if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { At one time, this was the same test bar for the ICMP/ICMP6 distinction. This is no longer true. Signed-off-by: Willem de Bruijn ---- Changes v1 -> v2 large rewrite - integrate with existing pktinfo cmsg generation code - on ipv4: only send with new flag, to maintain legacy behavior - on ipv6: send at most a single pktinfo cmsg - on ipv6: initialize fields if not yet initialized The recv cmsg interfaces are also relevant to the discussion of whether looping packet headers is problematic. For v6, cmsgs that identify many headers are already returned. This patch expands that to v4. If it sounds reasonable, I will follow with patches 1. request timestamps without payload with SOF_TIMESTAMPING_OPT_TSONLY (http://patchwork.ozlabs.org/patch/366967/) 2. sysctl to conditionally drop all timestamps that have payload or cmsg from users without CAP_NET_RAW. Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 12 +++++++++++- include/uapi/linux/net_tstamp.h | 3 ++- net/ipv4/ip_sockglue.c | 22 ++++++++++++++++++++-- net/ipv6/datagram.c | 21 +++++++++++++++++++-- 4 files changed, 52 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 1d6d02d6ba52..b08e27261ff9 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -122,7 +122,7 @@ SOF_TIMESTAMPING_RAW_HARDWARE: 1.3.3 Timestamp Options -The interface supports one option +The interface supports the options SOF_TIMESTAMPING_OPT_ID: @@ -145,6 +145,16 @@ SOF_TIMESTAMPING_OPT_ID: stream sockets, it increments with every byte. +SOF_TIMESTAMPING_OPT_CMSG: + + Support recv() cmsg for all timestamped packets. Control messages + are already supported unconditionally on all packets with receive + timestamps and on IPv6 packets with transmit timestamp. 
This option + extends them to IPv4 packets with transmit timestamp. One use case + is to correlate packets with their egress device, by enabling socket + option IP_PKTINFO simultaneously. + + 1.4 Bytestream Timestamps The SO_TIMESTAMPING interface supports timestamping of bytes in a diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index ff354021bb69..edbc888ceb51 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -23,8 +23,9 @@ enum { SOF_TIMESTAMPING_OPT_ID = (1<<7), SOF_TIMESTAMPING_TX_SCHED = (1<<8), SOF_TIMESTAMPING_TX_ACK = (1<<9), + SOF_TIMESTAMPING_OPT_CMSG = (1<<10), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_ACK, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_CMSG, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 59eba6c7a512..640f26c6a9fe 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -399,6 +399,22 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, + const struct sk_buff *skb, + int ee_origin) +{ + struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + + if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || + (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + (!skb->dev)) + return false; + + info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; + info->ipi_ifindex = skb->dev->ifindex; + return true; +} + /* * Handle MSG_ERRQUEUE */ @@ -446,7 +462,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; sin->sin_family = AF_UNSPEC; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; @@ -1051,7 +1069,7 @@ e_inval: } /** - * ipv4_pktinfo_prepare - transfert some info from rtable to skb + * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc1139687fd7..2464a00e36ab 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,6 +325,16 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } +static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb) +{ + int ifindex = skb->dev ? 
skb->dev->ifindex : -1; + + if (skb->protocol == htons(ETH_P_IPV6)) + IP6CB(skb)->iif = ifindex; + else + PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex; +} + /* * Handle MSG_ERRQUEUE */ @@ -388,8 +398,12 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; - if (np->rxopt.all) + if (np->rxopt.all) { + if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && + serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) + ip6_datagram_prepare_pktinfo_errqueue(skb); ip6_datagram_recv_common_ctl(sk, msg, skb); + } if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) @@ -491,7 +505,10 @@ void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } - put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + + if (src_info.ipi6_ifindex >= 0) + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, + sizeof(src_info), &src_info); } } -- cgit v1.2.3 From 60c04aecd8a72a84869308bdf2289a7aabb9a88c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 1 Dec 2014 20:29:06 -0800 Subject: udp: Neaten and reduce size of compute_score functions The compute_score functions are a bit difficult to read. Neaten them a bit to reduce object sizes and make them a bit more intelligible. Return early to avoid indentation and avoid unnecessary initializations. (allyesconfig, but w/ -O2 and no profiling) $ size net/ipv[46]/udp.o.* text data bss dec hex filename 28680 1184 25 29889 74c1 net/ipv4/udp.o.new 28756 1184 25 29965 750d net/ipv4/udp.o.old 17600 1010 2 18612 48b4 net/ipv6/udp.o.new 17632 1010 2 18644 48d4 net/ipv6/udp.o.old Signed-off-by: Joe Perches Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 111 +++++++++++++++++++++++++++++++------------------------- net/ipv6/udp.c | 113 ++++++++++++++++++++++++++++++++------------------------- 2 files changed, 125 insertions(+), 99 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b2d606833ce4..dd8e00634563 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -336,38 +336,45 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } -static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, - unsigned short hnum, - __be16 sport, __be32 daddr, __be16 dport, int dif) +static inline int compute_score(struct sock *sk, struct net *net, + __be32 saddr, unsigned short hnum, __be16 sport, + __be32 daddr, __be16 dport, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + ipv6_only_sock(sk)) + return -1; - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_rcv_saddr) { - if (inet->inet_rcv_saddr != daddr) - return -1; - score += 4; - } - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + score = (sk->sk_family == PF_INET) ? 
2 : 1; + inet = inet_sk(sk); + + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 4; + } + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 4; } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 4; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; + } + return score; } @@ -378,33 +385,39 @@ static inline int compute_score2(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif) { - int score = -1; + int score; + struct inet_sock *inet; + + if (!net_eq(sock_net(sk), net) || + ipv6_only_sock(sk)) + return -1; - if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + inet = inet_sk(sk); - if (inet->inet_rcv_saddr != daddr) + if (inet->inet_rcv_saddr != daddr || + inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET) ? 2 : 1; + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) return -1; - if (inet->inet_num != hnum) + score += 4; + } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; + score += 4; + } - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; } + return score; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7cfb5d745a2d..7f96432292ce 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -148,72 +148,85 @@ static inline int compute_score(struct sock *sk, struct net *net, const struct in6_addr *daddr, __be16 dport, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - sk->sk_family == PF_INET6) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6) + return -1; - score = 0; - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) - return -1; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score++; - } + score = 0; + inet = inet_sk(sk); + + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; } + + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + score++; + } + + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) + return -1; + score++; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + return score; } #define SCORE2_MAX (1 + 1 + 1) static inline int compute_score2(struct sock *sk, struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, unsigned short hnum, - int dif) + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) 
&& udp_sk(sk)->udp_port_hash == hnum && - sk->sk_family == PF_INET6) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6) + return -1; - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + + score = 0; + inet = inet_sk(sk); + + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; - score = 0; - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) - return -1; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score++; - } + score++; } + + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) + return -1; + score++; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + return score; } - /* called with read_rcu_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, -- cgit v1.2.3 From 6e3a8a937c2f86ee0b2d354808fc026a143b4518 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Dec 2014 16:13:23 -0800 Subject: tcp_cubic: add SNMP counters to track how effective is Hystart When deploying FQ pacing, one thing we noticed is that CUBIC Hystart triggers too soon. Having SNMP counters to have an idea of how often the various Hystart methods trigger is useful prior to any modifications. This patch adds SNMP counters tracking, how many time "ack train" or "Delay" based Hystart triggers, and cumulative sum of cwnd at the time Hystart decided to end SS (Slow Start) myhost:~# nstat -a | grep Hystart TcpExtTCPHystartTrainDetect 9 0.0 TcpExtTCPHystartTrainCwnd 20650 0.0 TcpExtTCPHystartDelayDetect 10 0.0 TcpExtTCPHystartDelayCwnd 360 0.0 -> Train detection was triggered 9 times, and average cwnd was 20650/9=2294, Delay detection was triggered 10 times and average cwnd was 36 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- include/uapi/linux/snmp.h | 4 ++++ net/ipv4/proc.c | 4 ++++ net/ipv4/tcp_cubic.c | 31 ++++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 30f541b32895..b22224100011 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -266,6 +266,10 @@ enum LINUX_MIB_TCPWANTZEROWINDOWADV, /* TCPWantZeroWindowAdv */ LINUX_MIB_TCPSYNRETRANS, /* TCPSynRetrans */ LINUX_MIB_TCPORIGDATASENT, /* TCPOrigDataSent */ + LINUX_MIB_TCPHYSTARTTRAINDETECT, /* TCPHystartTrainDetect */ + LINUX_MIB_TCPHYSTARTTRAINCWND, /* TCPHystartTrainCwnd */ + LINUX_MIB_TCPHYSTARTDELAYDETECT, /* TCPHystartDelayDetect */ + LINUX_MIB_TCPHYSTARTDELAYCWND, /* TCPHystartDelayCwnd */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6513ade8d6dc..8f9cd200ce20 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -288,6 +288,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), + SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), + SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), + SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), + SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 20de0118c98e..c1d07c7ed03d 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -363,16 +363,28 @@ static void hystart_update(struct sock *sk, u32 delay) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!(ca->found & hystart_detect)) { + if (ca->found & hystart_detect) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { ca->found |= HYSTART_ACK_TRAIN; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } + } + if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { if (ca->curr_rtt == 0 || ca->curr_rtt > delay) @@ -381,15 +393,16 @@ static void hystart_update(struct sock *sk, u32 delay) ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + - HYSTART_DELAY_THRESH(ca->delay_min>>4)) + HYSTART_DELAY_THRESH(ca->delay_min>>4)) { ca->found |= HYSTART_DELAY; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } - /* - * Either one of two conditions are met, - * we exit from slow start immediately. 
- */ - if (ca->found & hystart_detect) - tp->snd_ssthresh = tp->snd_cwnd; } } -- cgit v1.2.3 From 42eef7a0bb0989cd50d74e673422ff98a0ce4d7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Dec 2014 16:13:49 -0800 Subject: tcp_cubic: refine Hystart delay threshold In commit 2b4636a5f8ca ("tcp_cubic: make the delay threshold of HyStart less sensitive"), HYSTART_DELAY_MIN was changed to 4 ms. The remaining problem is that using delay_min + (delay_min/16) as the threshold is too sensitive. 6.25 % of variation is too small for rtt above 60 ms, which are not uncommon. Lets use 12.5 % instead (delay_min + (delay_min/8)) Tested: 80 ms RTT between peers, FQ/pacing packet scheduler on sender. 10 bulk transfers of 10 seconds : nstat >/dev/null for i in `seq 1 10` do netperf -H remote -- -k THROUGHPUT | grep THROUGHPUT done nstat | grep Hystart With the 6.25 % threshold : THROUGHPUT=20.66 THROUGHPUT=249.38 THROUGHPUT=254.10 THROUGHPUT=14.94 THROUGHPUT=251.92 THROUGHPUT=237.73 THROUGHPUT=19.18 THROUGHPUT=252.89 THROUGHPUT=21.32 THROUGHPUT=15.58 TcpExtTCPHystartTrainDetect 2 0.0 TcpExtTCPHystartTrainCwnd 4756 0.0 TcpExtTCPHystartDelayDetect 5 0.0 TcpExtTCPHystartDelayCwnd 180 0.0 With the 12.5 % threshold THROUGHPUT=251.09 THROUGHPUT=247.46 THROUGHPUT=250.92 THROUGHPUT=248.91 THROUGHPUT=250.88 THROUGHPUT=249.84 THROUGHPUT=250.51 THROUGHPUT=254.15 THROUGHPUT=250.62 THROUGHPUT=250.89 TcpExtTCPHystartTrainDetect 1 0.0 TcpExtTCPHystartTrainCwnd 3175 0.0 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c1d07c7ed03d..6b6002416a73 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -393,7 +393,7 @@ static void hystart_update(struct sock *sk, u32 delay) ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + - HYSTART_DELAY_THRESH(ca->delay_min>>4)) { + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found |= HYSTART_DELAY; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); -- cgit v1.2.3 From b61e9dcc5e77d534fa770a02877fd45f51d4e7f4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Nov 2014 10:52:29 -0500 Subject: raw.c: stick msghdr into raw_frag_vec we'll want access to ->msg_iter Signed-off-by: Al Viro --- net/ipv4/raw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 43385a9fa441..5c901ebf90af 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -82,7 +82,7 @@ #include struct raw_frag_vec { - struct iovec *iov; + struct msghdr *msg; union { struct icmphdr icmph; char c[1]; @@ -440,7 +440,7 @@ static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) /* We only need the first two bytes. 
*/ rfv->hlen = 2; - err = memcpy_fromiovec(rfv->hdr.c, rfv->iov, rfv->hlen); + err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); if (err) return err; @@ -478,7 +478,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, offset -= rfv->hlen; - return ip_generic_getfrag(rfv->iov, to, offset, len, odd, skb); + return ip_generic_getfrag(rfv->msg->msg_iov, to, offset, len, odd, skb); } static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, @@ -600,7 +600,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr, saddr, 0, 0); if (!inet->hdrincl) { - rfv.iov = msg->msg_iov; + rfv.msg = msg; rfv.hlen = 0; err = raw_probe_proto_opt(&rfv, &fl4); -- cgit v1.2.3 From f69e6d131f5dac8278ac79a902cc448364880d8b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Nov 2014 13:23:40 -0500 Subject: ip_generic_getfrag, udplite_getfrag: switch to passing msghdr Signed-off-by: Al Viro --- include/net/udplite.h | 3 ++- net/ipv4/ip_output.c | 6 +++--- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 4 ++-- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/udplite.h b/include/net/udplite.h index 9a28a5179400..d5baaba65b46 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -19,7 +19,8 @@ extern struct udp_table udplite_table; static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - return memcpy_fromiovecend(to, (struct iovec *) from, offset, len); + struct msghdr *msg = from; + return memcpy_fromiovecend(to, msg->msg_iov, offset, len); } /* Designate sk as UDP-Lite socket */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4a929adf2ab7..cdedcf144463 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -752,14 +752,14 @@ EXPORT_SYMBOL(ip_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct iovec *iov = from; + struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (memcpy_fromiovecend(to, iov, offset, len) < 0) + if (memcpy_fromiovecend(to, msg->msg_iov, offset, len) < 0) return -EFAULT; } else { __wsum csum = 0; - if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) + if (csum_partial_copy_fromiovecend(to, msg->msg_iov, offset, len, &csum) < 0) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5c901ebf90af..5d83bd2fcedb 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -478,7 +478,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, offset -= rfv->hlen; - return ip_generic_getfrag(rfv->msg->msg_iov, to, offset, len, odd, skb); + return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dd8e00634563..13b4dcf86ef6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1049,7 +1049,7 @@ back_from_confirm: /* Lockless fast path for the non-corking case. 
*/ if (!corkreq) { - skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, + skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); @@ -1080,7 +1080,7 @@ back_from_confirm: do_append_data: up->len += ulen; - err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, + err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 942f67b91274..11a9283fda51 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -727,7 +727,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, offset -= rfv->hlen; - return ip_generic_getfrag(rfv->msg->msg_iov, to, offset, len, odd, skb); + return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7f96432292ce..189dc4ae3eca 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1312,7 +1312,7 @@ do_append_data: dontfrag = np->dontfrag; up->len += ulen; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; - err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, + err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 2177b960da87..8611f1b63141 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -619,7 +619,7 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, back_from_confirm: lock_sock(sk); - err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, + err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, msg->msg_flags, dontfrag); -- cgit v1.2.3 From f4362a2c9524678f0459cf410403f8595e5cfce5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Nov 2014 13:26:06 -0500 Subject: switch tcp_sock->ucopy from iovec (ucopy.iov) to msghdr (ucopy.msg) Signed-off-by: Al Viro --- include/linux/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f566b8567892..5d9cc9cd2855 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -162,7 +162,7 @@ struct tcp_sock { struct { struct sk_buff_head prequeue; struct task_struct *task; - struct iovec *iov; + struct msghdr *msg; int memory; int len; } ucopy; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dc13a3657e8e..4a96f3730170 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1729,7 +1729,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { user_recv = current; tp->ucopy.task = user_recv; - tp->ucopy.iov = msg->msg_iov; + tp->ucopy.msg = msg; } tp->ucopy.len = len; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 69de1a1c05c9..075ab4d5af5e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4421,7 +4421,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __set_current_state(TASK_RUNNING); local_bh_enable(); - if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { + if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; eaten = (chunk == skb->len); @@ -4941,10 
+4941,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) local_bh_enable(); if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); + err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); else - err = skb_copy_and_csum_datagram_iovec(skb, hlen, - tp->ucopy.iov); + err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); if (!err) { tp->ucopy.len -= chunk; -- cgit v1.2.3 From c0371da6047abd261bc483c744dbc7d81a116172 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Nov 2014 10:42:55 -0500 Subject: put iov_iter into msghdr Note that the code _using_ ->msg_iter at that point will be very unhappy with anything other than unshifted iovec-backed iov_iter. We still need to convert users to proper primitives. Signed-off-by: Al Viro --- crypto/algif_hash.c | 4 ++-- crypto/algif_skcipher.c | 4 ++-- drivers/net/macvtap.c | 8 ++------ drivers/net/tun.c | 8 ++------ drivers/vhost/net.c | 8 +++----- fs/afs/rxrpc.c | 14 ++++++-------- include/linux/skbuff.h | 16 ++++++++++------ include/linux/socket.h | 3 +-- include/net/bluetooth/l2cap.h | 2 +- include/net/udplite.h | 3 ++- net/atm/common.c | 5 +---- net/bluetooth/6lowpan.c | 6 +++--- net/bluetooth/a2mp.c | 3 +-- net/bluetooth/smp.c | 3 +-- net/caif/caif_socket.c | 2 +- net/compat.c | 10 ++++++---- net/ipv4/ip_output.c | 6 ++++-- net/ipv4/ping.c | 3 ++- net/ipv4/raw.c | 3 ++- net/ipv4/tcp.c | 6 +++--- net/ipv4/tcp_output.c | 2 +- net/ipv6/ping.c | 3 ++- net/ipv6/raw.c | 3 ++- net/netlink/af_netlink.c | 2 +- net/packet/af_packet.c | 7 ++----- net/rds/recv.c | 7 ++++--- net/rds/send.c | 4 +--- net/rxrpc/ar-output.c | 8 +++----- net/sctp/socket.c | 5 +---- net/socket.c | 27 ++++++++++++--------------- net/tipc/msg.c | 4 ++-- net/unix/af_unix.c | 10 ++-------- net/vmw_vsock/vmci_transport.c | 3 ++- 33 files changed, 90 insertions(+), 112 deletions(-) (limited to 'net/ipv4') diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 35c93ff11f35..83cd2cc49c9f 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -42,7 +42,7 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock, struct alg_sock *ask = alg_sk(sk); struct hash_ctx *ctx = ask->private; unsigned long iovlen; - struct iovec *iov; + const struct iovec *iov; long copied = 0; int err; @@ -58,7 +58,7 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock, ctx->more = 0; - for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0; + for (iov = msg->msg_iter.iov, iovlen = msg->msg_iter.nr_segs; iovlen > 0; iovlen--, iov++) { unsigned long seglen = iov->iov_len; char __user *from = iov->iov_base; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index c3b482bee208..4f45dab24648 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -429,13 +429,13 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock, struct skcipher_sg_list *sgl; struct scatterlist *sg; unsigned long iovlen; - struct iovec *iov; + const struct iovec *iov; int err = -EAGAIN; int used; long copied = 0; lock_sock(sk); - for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0; + for (iov = msg->msg_iter.iov, iovlen = msg->msg_iter.nr_segs; iovlen > 0; iovlen--, iov++) { unsigned long seglen = iov->iov_len; char __user *from = iov->iov_base; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index ba1e5db2152e..2c157cced81f 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -1095,9 +1095,7 @@ static int macvtap_sendmsg(struct kiocb 
*iocb, struct socket *sock, struct msghdr *m, size_t total_len) { struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock); - struct iov_iter from; - iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, total_len); - return macvtap_get_user(q, m, &from, m->msg_flags & MSG_DONTWAIT); + return macvtap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); } static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock, @@ -1105,12 +1103,10 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock, int flags) { struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock); - struct iov_iter to; int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) return -EINVAL; - iov_iter_init(&to, READ, m->msg_iov, m->msg_iovlen, total_len); - ret = macvtap_do_read(q, &to, flags & MSG_DONTWAIT); + ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9c58286b8a42..f3e992ed87ac 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1449,13 +1449,11 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, int ret; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = __tun_get(tfile); - struct iov_iter from; if (!tun) return -EBADFD; - iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, total_len); - ret = tun_get_user(tun, tfile, m->msg_control, &from, + ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); tun_put(tun); return ret; @@ -1467,7 +1465,6 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = __tun_get(tfile); - struct iov_iter to; int ret; if (!tun) @@ -1482,8 +1479,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - iov_iter_init(&to, READ, m->msg_iov, m->msg_iovlen, total_len); - ret = tun_do_read(tun, tfile, &to, flags & MSG_DONTWAIT); + ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 8dae2f724a35..9f06e70a2631 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -342,7 +342,6 @@ static void handle_tx(struct vhost_net *net) .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, - .msg_iov = vq->iov, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; @@ -396,8 +395,8 @@ static void handle_tx(struct vhost_net *net) } /* Skip header. TODO: support TSO. */ s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out); - msg.msg_iovlen = out; len = iov_length(vq->iov, out); + iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len); /* Sanity check */ if (!len) { vq_err(vq, "Unexpected header len for TX: " @@ -562,7 +561,6 @@ static void handle_rx(struct vhost_net *net) .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. 
*/ .msg_controllen = 0, - .msg_iov = vq->iov, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr_mrg_rxbuf hdr = { @@ -600,7 +598,7 @@ static void handle_rx(struct vhost_net *net) break; /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { - msg.msg_iovlen = 1; + iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); err = sock->ops->recvmsg(NULL, sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); @@ -626,7 +624,7 @@ static void handle_rx(struct vhost_net *net) /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF: * needed because recvmsg can modify msg_iov. */ copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in); - msg.msg_iovlen = in; + iov_iter_init(&msg.msg_iter, READ, vq->iov, in, sock_len); err = sock->ops->recvmsg(NULL, sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 03a3beb17004..06e14bfb3496 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, _debug("- range %u-%u%s", offset, to, msg->msg_flags ? " [more]" : ""); - msg->msg_iov = (struct iovec *) iov; - msg->msg_iovlen = 1; + iov_iter_init(&msg->msg_iter, WRITE, + (struct iovec *) iov, 1, to - offset); /* have to change the state *before* sending the last * packet as RxRPC might give us the reply before it @@ -384,8 +384,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = (struct iovec *) iov; - msg.msg_iovlen = 1; + iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1, + call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = (call->send_pages ? MSG_MORE : 0); @@ -778,8 +778,7 @@ void afs_send_empty_reply(struct afs_call *call) iov[0].iov_len = 0; msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = iov; - msg.msg_iovlen = 0; + iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0); /* WTF? 
*/ msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -815,8 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = iov; - msg.msg_iovlen = 1; + iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ef64cec42804..52cf1bdac0d8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2646,22 +2646,24 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, struct iovec *to, int size); +int skb_copy_datagram_iter(const struct sk_buff *from, int offset, + struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, struct msghdr *msg, int size) { - return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size); + /* XXX: stripping const */ + return skb_copy_datagram_iovec(from, offset, (struct iovec *)msg->msg_iter.iov, size); } int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg) { - return skb_copy_and_csum_datagram_iovec(skb, hlen, msg->msg_iov); + /* XXX: stripping const */ + return skb_copy_and_csum_datagram_iovec(skb, hlen, (struct iovec *)msg->msg_iter.iov); } int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); -int skb_copy_datagram_iter(const struct sk_buff *from, int offset, - struct iov_iter *to, int size); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb); @@ -2689,12 +2691,14 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { - return memcpy_fromiovec(data, msg->msg_iov, len); + /* XXX: stripping const */ + return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len); } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) { - return memcpy_toiovec(msg->msg_iov, data, len); + /* XXX: stripping const */ + return memcpy_toiovec((struct iovec *)msg->msg_iter.iov, data, len); } struct skb_checksum_ops { diff --git a/include/linux/socket.h b/include/linux/socket.h index de5222832be4..048d6d6eed6d 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -47,8 +47,7 @@ struct linger { struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ - struct iovec *msg_iov; /* scatter/gather array */ - __kernel_size_t msg_iovlen; /* # elements in msg_iov */ + struct iov_iter msg_iter; /* data */ void *msg_control; /* ancillary data */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 4e23674d3649..bca6fc0a3196 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -911,7 +911,7 @@ static inline int l2cap_chan_no_memcpy_fromiovec(struct l2cap_chan *chan, /* Following is safe since for compiler definitions of kvec and * iovec are identical, yielding the 
same in-core layout and alignment */ - struct kvec *vec = (struct kvec *)msg->msg_iov; + struct kvec *vec = (struct kvec *)msg->msg_iter.iov; while (len > 0) { if (vec->iov_len) { diff --git a/include/net/udplite.h b/include/net/udplite.h index d5baaba65b46..ae7c8d1fbcad 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -20,7 +20,8 @@ static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; - return memcpy_fromiovecend(to, msg->msg_iov, offset, len); + /* XXX: stripping const */ + return memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len); } /* Designate sk as UDP-Lite socket */ diff --git a/net/atm/common.c b/net/atm/common.c index f59112944c91..b84057e41bd6 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -577,9 +577,6 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, struct atm_vcc *vcc; struct sk_buff *skb; int eff, error; - struct iov_iter from; - - iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, size); lock_sock(sk); if (sock->state != SS_CONNECTED) { @@ -634,7 +631,7 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, goto out; skb->dev = NULL; /* for paths shared with net_device interfaces */ ATM_SKB(skb)->atm_options = vcc->atm_options; - if (copy_from_iter(skb_put(skb, size), size, &from) != size) { + if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) { kfree_skb(skb); error = -EFAULT; goto out; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index bdcaefd2db12..d8c67a5e7a02 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -537,12 +537,12 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, */ chan->data = skb; - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = (struct iovec *) &iv; - msg.msg_iovlen = 1; iv.iov_base = skb->data; iv.iov_len = skb->len; + memset(&msg, 0, sizeof(msg)); + iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *) &iv, 1, skb->len); + err = l2cap_chan_send(chan, &msg, skb->len); if (err > 0) { netdev->stats.tx_bytes += err; diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 5dcade511fdb..716d2a388858 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -60,8 +60,7 @@ void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data) memset(&msg, 0, sizeof(msg)); - msg.msg_iov = (struct iovec *) &iv; - msg.msg_iovlen = 1; + iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&iv, 1, total_len); l2cap_chan_send(chan, &msg, total_len); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 069b76e03b57..21f555b4df17 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -268,8 +268,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) memset(&msg, 0, sizeof(msg)); - msg.msg_iov = (struct iovec *) &iv; - msg.msg_iovlen = 2; + iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index ac618b0b8a4f..769b185fefbd 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -535,7 +535,7 @@ static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock, goto err; ret = -EINVAL; - if (unlikely(msg->msg_iov->iov_base == NULL)) + if (unlikely(msg->msg_iter.iov->iov_base == NULL)) goto err; noblock = msg->msg_flags & MSG_DONTWAIT; diff --git a/net/compat.c b/net/compat.c index 062f157d2a6b..3236b4167a32 100644 --- 
a/net/compat.c +++ b/net/compat.c @@ -37,13 +37,14 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, struct iovec **iov) { compat_uptr_t uaddr, uiov, tmp3; + compat_size_t nr_segs; ssize_t err; if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) || __get_user(uaddr, &umsg->msg_name) || __get_user(kmsg->msg_namelen, &umsg->msg_namelen) || __get_user(uiov, &umsg->msg_iov) || - __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) || + __get_user(nr_segs, &umsg->msg_iovlen) || __get_user(tmp3, &umsg->msg_control) || __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) @@ -68,14 +69,15 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (kmsg->msg_iovlen > UIO_MAXIOV) + if (nr_segs > UIO_MAXIOV) return -EMSGSIZE; err = compat_rw_copy_check_uvector(save_addr ? READ : WRITE, - compat_ptr(uiov), kmsg->msg_iovlen, + compat_ptr(uiov), nr_segs, UIO_FASTIOV, *iov, iov); if (err >= 0) - kmsg->msg_iov = *iov; + iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, + *iov, nr_segs, err); return err; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cdedcf144463..b50861b22b6b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -755,11 +755,13 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (memcpy_fromiovecend(to, msg->msg_iov, offset, len) < 0) + /* XXX: stripping const */ + if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0) return -EFAULT; } else { __wsum csum = 0; - if (csum_partial_copy_fromiovecend(to, msg->msg_iov, offset, len, &csum) < 0) + /* XXX: stripping const */ + if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8dd4ae0424fc..c0d82f78d364 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -811,7 +811,8 @@ back_from_confirm: pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; - pfh.iov = msg->msg_iov; + /* XXX: stripping const */ + pfh.iov = (struct iovec *)msg->msg_iter.iov; pfh.wcheck = 0; pfh.family = AF_INET; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5d83bd2fcedb..0bb68df5055d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -625,7 +625,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, back_from_confirm: if (inet->hdrincl) - err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, + /* XXX: stripping const */ + err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len, &rt, msg->msg_flags); else { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4a96f3730170..54ba6209eea9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1085,7 +1085,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { - struct iovec *iov; + const struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags, err, copied = 0; @@ -1136,8 +1136,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. 
*/ - iovlen = msg->msg_iovlen; - iov = msg->msg_iov; + iovlen = msg->msg_iter.nr_segs; + iov = msg->msg_iter.iov; copied = 0; err = -EPIPE; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f5bd4bd3f7e6..3e225b03eb95 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3050,7 +3050,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) syn_data->ip_summed = CHECKSUM_PARTIAL; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), - fo->data->msg_iov, 0, space))) { + fo->data->msg_iter.iov, 0, space))) { kfree_skb(syn_data); goto fallback; } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 5b7a1ed2aba9..2d3148378a1f 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -163,7 +163,8 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence; - pfh.iov = msg->msg_iov; + /* XXX: stripping const */ + pfh.iov = (struct iovec *)msg->msg_iter.iov; pfh.wcheck = 0; pfh.family = AF_INET6; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 11a9283fda51..ee25631f8c29 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -886,7 +886,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, back_from_confirm: if (inet->hdrincl) - err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags); + /* XXX: stripping const */ + err = rawv6_send_hdrinc(sk, (struct iovec *)msg->msg_iter.iov, len, &fl6, &dst, msg->msg_flags); else { lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 63aa5c8acf12..cc9bcf008b03 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2305,7 +2305,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, } if (netlink_tx_is_mmaped(sk) && - msg->msg_iov->iov_base == NULL) { + msg->msg_iter.iov->iov_base == NULL) { err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb); goto out; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index efa844501136..ed2e620b61df 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2408,11 +2408,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) unsigned short gso_type = 0; int hlen, tlen; int extra_len = 0; - struct iov_iter from; ssize_t n; - iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len); - /* * Get and verify the address. 
*/ @@ -2451,7 +2448,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) len -= vnet_hdr_len; err = -EFAULT; - n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &from); + n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter); if (n != vnet_hdr_len) goto out_unlock; @@ -2522,7 +2519,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } /* Returns -EFAULT on error */ - err = skb_copy_datagram_from_iter(skb, offset, &from, len); + err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; diff --git a/net/rds/recv.c b/net/rds/recv.c index 47d7b1029b33..f9ec1acd801c 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -404,7 +404,6 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; - struct iov_iter to; /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ timeo = sock_rcvtimeo(sk, nonblock); @@ -415,6 +414,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; while (1) { + struct iov_iter save; /* If there are pending notifications, do those - and nothing else */ if (!list_empty(&rs->rs_notify_queue)) { ret = rds_notify_queue_get(rs, msg); @@ -450,8 +450,8 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rdsdebug("copying inc %p from %pI4:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); - iov_iter_init(&to, READ, msg->msg_iov, msg->msg_iovlen, size); - ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &to); + save = msg->msg_iter; + ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); if (ret < 0) break; @@ -464,6 +464,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_inc_put(inc); inc = NULL; rds_stats_inc(s_recv_deliver_raced); + msg->msg_iter = save; continue; } diff --git a/net/rds/send.c b/net/rds/send.c index 4de62ead1c71..40a5629a0a13 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -934,9 +934,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, int queued = 0, allocated_mr = 0; int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); - struct iov_iter from; - iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, payload_len); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { @@ -984,7 +982,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, ret = -ENOMEM; goto out; } - ret = rds_message_copy_from_user(rm, &from); + ret = rds_message_copy_from_user(rm, &msg->msg_iter); if (ret) goto out; } diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 0b4b9a79f5ab..86e0f10aa2c0 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -531,14 +531,12 @@ static int rxrpc_send_data(struct kiocb *iocb, struct rxrpc_skb_priv *sp; unsigned char __user *from; struct sk_buff *skb; - struct iovec *iov; + const struct iovec *iov; struct sock *sk = &rx->sk; long timeo; bool more; int ret, ioc, segment, copied; - _enter(",,,{%zu},%zu", msg->msg_iovlen, len); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ @@ -547,8 +545,8 @@ static int rxrpc_send_data(struct kiocb *iocb, if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return 
-EPIPE; - iov = msg->msg_iov; - ioc = msg->msg_iovlen - 1; + iov = msg->msg_iter.iov; + ioc = msg->msg_iter.nr_segs - 1; from = iov->iov_base; segment = iov->iov_len; iov++; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0397ac9fd98c..c92f96cda699 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1609,9 +1609,6 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, __u16 sinfo_flags = 0; long timeo; int err; - struct iov_iter from; - - iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, msg_len); err = 0; sp = sctp_sk(sk); @@ -1950,7 +1947,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, } /* Break the message into multiple chunks of maximum size. */ - datamsg = sctp_datamsg_from_user(asoc, sinfo, &from); + datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter); if (IS_ERR(datamsg)) { err = PTR_ERR(datamsg); goto out_free; diff --git a/net/socket.c b/net/socket.c index f676ac4a3701..8809afccf7fa 100644 --- a/net/socket.c +++ b/net/socket.c @@ -689,8 +689,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, * the following is safe, since for compiler definitions of kvec and * iovec are identical, yielding the same in-core layout and alignment */ - msg->msg_iov = (struct iovec *)vec; - msg->msg_iovlen = num; + iov_iter_init(&msg->msg_iter, WRITE, (struct iovec *)vec, num, size); result = sock_sendmsg(sock, msg, size); set_fs(oldfs); return result; @@ -853,7 +852,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, * the following is safe, since for compiler definitions of kvec and * iovec are identical, yielding the same in-core layout and alignment */ - msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num; + iov_iter_init(&msg->msg_iter, READ, (struct iovec *)vec, num, size); result = sock_recvmsg(sock, msg, size, flags); set_fs(oldfs); return result; @@ -913,8 +912,7 @@ static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, msg->msg_namelen = 0; msg->msg_control = NULL; msg->msg_controllen = 0; - msg->msg_iov = (struct iovec *)iov; - msg->msg_iovlen = nr_segs; + iov_iter_init(&msg->msg_iter, READ, iov, nr_segs, size); msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); @@ -953,8 +951,7 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, msg->msg_namelen = 0; msg->msg_control = NULL; msg->msg_controllen = 0; - msg->msg_iov = (struct iovec *)iov; - msg->msg_iovlen = nr_segs; + iov_iter_init(&msg->msg_iter, WRITE, iov, nr_segs, size); msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; if (sock->type == SOCK_SEQPACKET) msg->msg_flags |= MSG_EOR; @@ -1798,8 +1795,7 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, iov.iov_base = buff; iov.iov_len = len; msg.msg_name = NULL; - msg.msg_iov = &iov; - msg.msg_iovlen = 1; + iov_iter_init(&msg.msg_iter, WRITE, &iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; @@ -1856,10 +1852,9 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_iovlen = 1; - msg.msg_iov = &iov; iov.iov_len = size; iov.iov_base = ubuf; + iov_iter_init(&msg.msg_iter, READ, &iov, 1, size); /* Save some cycles and don't copy the address if not needed */ msg.msg_name = addr ? 
(struct sockaddr *)&address : NULL; /* We assume all kernel code knows the size of sockaddr_storage */ @@ -1993,13 +1988,14 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, { struct sockaddr __user *uaddr; struct iovec __user *uiov; + size_t nr_segs; ssize_t err; if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) || __get_user(uaddr, &umsg->msg_name) || __get_user(kmsg->msg_namelen, &umsg->msg_namelen) || __get_user(uiov, &umsg->msg_iov) || - __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) || + __get_user(nr_segs, &umsg->msg_iovlen) || __get_user(kmsg->msg_control, &umsg->msg_control) || __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) @@ -2029,14 +2025,15 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (kmsg->msg_iovlen > UIO_MAXIOV) + if (nr_segs > UIO_MAXIOV) return -EMSGSIZE; err = rw_copy_check_uvector(save_addr ? READ : WRITE, - uiov, kmsg->msg_iovlen, + uiov, nr_segs, UIO_FASTIOV, *iov, iov); if (err >= 0) - kmsg->msg_iov = *iov; + iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, + *iov, nr_segs, err); return err; } diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 5b0659791c07..a687b30a699c 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -194,7 +194,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, __skb_queue_tail(list, skb); skb_copy_to_linear_data(skb, mhdr, mhsz); pktpos = skb->data + mhsz; - if (!dsz || !memcpy_fromiovecend(pktpos, m->msg_iov, offset, + if (!dsz || !memcpy_fromiovecend(pktpos, m->msg_iter.iov, offset, dsz)) return dsz; rc = -EFAULT; @@ -224,7 +224,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, if (drem < pktrem) pktrem = drem; - if (memcpy_fromiovecend(pktpos, m->msg_iov, offset, pktrem)) { + if (memcpy_fromiovecend(pktpos, m->msg_iter.iov, offset, pktrem)) { rc = -EFAULT; goto error; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4450d6226602..8e1b10274b02 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1459,9 +1459,6 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct scm_cookie tmp_scm; int max_level; int data_len = 0; - struct iov_iter from; - - iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len); if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1519,7 +1516,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, skb_put(skb, len - data_len); skb->data_len = data_len; skb->len = len; - err = skb_copy_datagram_from_iter(skb, 0, &from, len); + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); if (err) goto out_free; @@ -1641,9 +1638,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, bool fds_sent = false; int max_level; int data_len; - struct iov_iter from; - - iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len); if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1700,7 +1694,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; - err = skb_copy_datagram_from_iter(skb, 0, &from, size); + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) { kfree_skb(skb); goto out_err; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 20a0ba3bfff6..02d2e5229240 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1850,7 +1850,8 @@ static ssize_t 
vmci_transport_stream_enqueue( struct msghdr *msg, size_t len) { - return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg->msg_iov, len, 0); + /* XXX: stripping const */ + return vmci_qpair_enquev(vmci_trans(vsk)->qpair, (struct iovec *)msg->msg_iter.iov, len, 0); } static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk) -- cgit v1.2.3 From 605ad7f184b60cfaacbc038aa6c55ee68dee3c89 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 7 Dec 2014 12:22:18 -0800 Subject: tcp: refine TSO autosizing Commit 95bd09eb2750 ("tcp: TSO packets automatic sizing") tried to control TSO size, but did this at the wrong place (sendmsg() time). At sendmsg() time, we might have a pessimistic view of flow rate, and we end up building very small skbs (with 2 MSS per skb). This is bad because: - It sends small TSO packets even in Slow Start, where the rate quickly increases. - It tends to make the socket write queue very big, increasing tcp_ack() processing time, but also increasing memory needs, not necessarily accounted for, as fast clone overhead is currently ignored. - Lower GRO efficiency and more ACK packets. Servers with a lot of short-lived connections suffer from this. Let's instead fill skbs as much as possible (64KB of payload), but split them at xmit time, when we have a precise idea of the flow rate. The skb split is actually quite efficient. The patch looks bigger than necessary because the TCP Small Queues decision now has to take place after the eventual split. As Neal suggested, introduce a new tcp_tso_autosize() helper, so that tcp_tso_should_defer() can be synchronized on the same goal. Rename tp->xmit_size_goal_segs to tp->gso_segs, as this variable contains the number of MSS that we can put in a GSO packet, and is not related to the autosizing goal anymore. Tested: 40 ms rtt link nstat >/dev/null netperf -H remote -l -2000000 -- -s 1000000 nstat | egrep "IpInReceives|IpOutRequests|TcpOutSegs|IpExtOutOctets" Before patch: Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/s 87380 2000000 2000000 0.36 44.22 IpInReceives 600 0.0 IpOutRequests 599 0.0 TcpOutSegs 1397 0.0 IpExtOutOctets 2033249 0.0 After patch: Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 2000000 2000000 0.36 44.27 IpInReceives 221 0.0 IpOutRequests 232 0.0 TcpOutSegs 1397 0.0 IpExtOutOctets 2013953 0.0 Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S.
Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp.c | 60 ++++++++++++++++++--------------------------------- net/ipv4/tcp_output.c | 59 ++++++++++++++++++++++++++++++++++---------------- 3 files changed, 63 insertions(+), 58 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f566b8567892..3fa0a9669a3a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -130,7 +130,7 @@ struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ - u16 xmit_size_goal_segs; /* Goal for segmenting output packets */ + u16 gso_segs; /* Max number of segs per GSO packet */ /* * Header prediction flags diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dc13a3657e8e..427aee33ffc0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal, old_size_goal; - - xmit_size_goal = mss_now; - - if (large_allowed && sk_can_gso(sk)) { - u32 gso_size, hlen; - - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - /* Goal is to send at least one packet per ms, - * not one big TSO packet every 100 ms. - * This preserves ACK clocking and is consistent - * with tcp_tso_should_defer() heuristic. - */ - gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); - gso_size = max_t(u32, gso_size, - sysctl_tcp_min_tso_segs * mss_now); - - xmit_size_goal = min_t(u32, gso_size, - sk->sk_gso_max_size - 1 - hlen); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - - /* We try hard to avoid divides here */ - old_size_goal = tp->xmit_size_goal_segs * mss_now; - - if (likely(old_size_goal <= xmit_size_goal && - old_size_goal + mss_now > xmit_size_goal)) { - xmit_size_goal = old_size_goal; - } else { - tp->xmit_size_goal_segs = - min_t(u16, xmit_size_goal / mss_now, - sk->sk_gso_max_segs); - xmit_size_goal = tp->xmit_size_goal_segs * mss_now; - } + u32 new_size_goal, size_goal, hlen; + + if (!large_allowed || !sk_can_gso(sk)) + return mss_now; + + /* Maybe we should/could use sk->sk_prot->max_header here ? 
*/ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + new_size_goal = sk->sk_gso_max_size - 1 - hlen; + new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); + + /* We try hard to avoid divides here */ + size_goal = tp->gso_segs * mss_now; + if (unlikely(new_size_goal < size_goal || + new_size_goal >= size_goal + mss_now)) { + tp->gso_segs = min_t(u16, new_size_goal / mss_now, + sk->sk_gso_max_segs); + size_goal = tp->gso_segs * mss_now; } - return max(xmit_size_goal, mss_now); + return max(size_goal, mss_now); } static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f5bd4bd3f7e6..f37ecf53ee8a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1524,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } + +/* Return how many segs we'd like on a TSO packet, + * to send one TSO packet per ms + */ +static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +{ + u32 bytes, segs; + + bytes = min(sk->sk_pacing_rate >> 10, + sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. + */ + segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + + return min_t(u32, segs, sk->sk_gso_max_segs); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -1731,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited) + bool *is_cwnd_limited, u32 max_segs) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1761,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= min_t(unsigned int, sk->sk_gso_max_size, - tp->xmit_size_goal_segs * tp->mss_cache)) + if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1959,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int cwnd_quota; int result; bool is_cwnd_limited = false; + u32 max_segs; sent_pkts = 0; @@ -1972,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } + max_segs = tcp_tso_autosize(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2004,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } else { if (!push_one && - tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + tcp_tso_should_defer(sk, skb, &is_cwnd_limited, + max_segs)) break; } + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, + min_t(unsigned int, + cwnd_quota, + max_segs), + nonagle); + + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; + /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. 
* This allows for : @@ -2018,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ - limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, - sk->sk_pacing_rate >> 10); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); @@ -2032,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - limit = mss_now; - if (tso_segs > 1 && !tcp_urg_mode(tp)) - limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - sk->sk_gso_max_segs), - nonagle); - - if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) - break; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -- cgit v1.2.3 From 0f85feae6b710ced3abad5b2b47d31dfcb956b62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Dec 2014 09:56:08 -0800 Subject: tcp: fix more NULL deref after prequeue changes When I cooked commit c3658e8d0f1 ("tcp: fix possible NULL dereference in tcp_vX_send_reset()") I missed other spots where we could deref a NULL skb_dst(skb). Again, if a socket is provided, we do not need skb_dst() to get a pointer to the network namespace: sock_net(sk) is good enough. Reported-by: Dann Frazier Bisected-by: Dann Frazier Tested-by: Dann Frazier Signed-off-by: Eric Dumazet Fixes: ca777eff51f7 ("tcp: remove dst refcount false sharing for prequeue mode") Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 147be2024290..ef7089ca86e2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -623,6 +623,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG hash_location = tcp_parse_md5sig_option(th); if (!sk && hash_location) { @@ -633,7 +634,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. 
*/ - sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); @@ -681,7 +682,6 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index dc495ae2ead0..c277951d783b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -787,16 +787,16 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; -static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - u32 tsval, u32 tsecr, int oif, - struct tcp_md5sig_key *key, int rst, u8 tclass, - u32 label) +static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, + u32 ack, u32 win, u32 tsval, u32 tsecr, + int oif, struct tcp_md5sig_key *key, int rst, + u8 tclass, u32 label) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; struct sk_buff *buff; struct flowi6 fl6; - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); struct dst_entry *dst; @@ -946,7 +946,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) (th->doff << 2); oif = sk ? sk->sk_bound_dev_if : 0; - tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG release_sk1: @@ -957,13 +957,13 @@ release_sk1: #endif } -static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 tsval, u32 tsecr, int oif, +static void tcp_v6_send_ack(struct sock *sk, struct sk_buff *skb, u32 seq, + u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, u32 label) { - tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass, - label); + tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, + tclass, label); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -971,7 +971,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -986,10 +986,10 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, - req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, + tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); } -- cgit v1.2.3 From f95b414edb18de59940dcebbefb49cf25c6d505c Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 11 Dec 2014 11:22:04 +0800 Subject: net: introduce helper macro for_each_cmsghdr Introduce the helper macro for_each_cmsghdr as a wrapper for enumerating the cmsghdr entries of a msghdr; this is just a cleanup. Signed-off-by: Gu Zheng Signed-off-by: David S. Miller --- crypto/af_alg.c | 2 +- include/linux/socket.h | 4 ++++ net/core/scm.c | 3 +-- net/dccp/proto.c | 5 ++--- net/ipv4/ip_sockglue.c | 2 +- net/ipv6/datagram.c | 2 +- net/iucv/af_iucv.c | 4 +--- net/rds/send.c | 4 ++-- net/rxrpc/ar-output.c | 2 +- net/sctp/socket.c | 3 +-- 10 files changed, 15 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 6a3ad8011585..bc21f520d489 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -399,7 +399,7 @@ int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con) { struct cmsghdr *cmsg; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_ALG) diff --git a/include/linux/socket.h b/include/linux/socket.h index 048d6d6eed6d..6e49a14365dc 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -103,6 +103,10 @@ struct cmsghdr { (cmsg)->cmsg_len <= (unsigned long) \ ((mhdr)->msg_controllen - \ ((char *)(cmsg) - (char *)(mhdr)->msg_control))) +#define for_each_cmsghdr(cmsg, msg) \ + for (cmsg = CMSG_FIRSTHDR(msg); \ + cmsg; \ + cmsg = CMSG_NXTHDR(msg, cmsg)) /* * Get the next cmsg header diff --git a/net/core/scm.c b/net/core/scm.c index b442e7e25e60..3b6899b7d810 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) struct cmsghdr *cmsg; int err; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) - { + for_each_cmsghdr(cmsg, msg) { err = -EINVAL; /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 19f038739087..e171b780b499 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -703,7 +703,7 @@ EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { - struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + struct cmsghdr *cmsg; /* * Assign an (opaque) qpolicy priority value to skb->priority. 
@@ -717,8 +717,7 @@ static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) */ skb->priority = 0; - for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { - + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 640f26c6a9fe..8a89c738b7a3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -192,7 +192,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, int err, val; struct cmsghdr *cmsg; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2464a00e36ab..100c589a2a6c 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -657,7 +657,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, int len; int err = 0; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { int addr_type; if (!CMSG_OK(msg, cmsg)) { diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 1cd3f8107239..2e9953b2db84 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1070,9 +1070,7 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, txmsg.class = 0; /* iterate over control messages */ - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; - cmsg = CMSG_NXTHDR(msg, cmsg)) { - + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; goto out; diff --git a/net/rds/send.c b/net/rds/send.c index 40a5629a0a13..42f65d4305c8 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -826,7 +826,7 @@ static int rds_rm_size(struct msghdr *msg, int data_len) int cmsg_groups = 0; int retval; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; @@ -878,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg; int ret = 0; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 86e0f10aa2c0..e1a9373e5979 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -45,7 +45,7 @@ static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg, if (msg->msg_controllen == 0) return -EINVAL; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c92f96cda699..2625eccb77d5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6592,8 +6592,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) struct cmsghdr *cmsg; struct msghdr *my_msg = (struct msghdr *)msg; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; - cmsg = CMSG_NXTHDR(my_msg, cmsg)) { + for_each_cmsghdr(cmsg, my_msg) { if (!CMSG_OK(my_msg, cmsg)) return -EINVAL; -- cgit v1.2.3
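For readers who want to see the pattern outside the kernel: the loop that for_each_cmsghdr() wraps is the standard CMSG_FIRSTHDR()/CMSG_NXTHDR() walk over a msghdr's ancillary data. The sketch below is an illustrative userspace analogue only (the function name dump_first_scm_rights_fd and the assumption of an AF_UNIX datagram socket are editorial assumptions, not part of any patch above); it is not kernel code, and note that the userspace struct msghdr still exposes msg_iov/msg_iovlen rather than the in-kernel msg_iter.

/*
 * Minimal userspace sketch of the iteration that for_each_cmsghdr()
 * expresses in the kernel: walk every control message attached to a
 * received datagram and pick out the first SCM_RIGHTS descriptor.
 * Assumes "sock" is an AF_UNIX SOCK_DGRAM socket (illustrative only).
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void dump_first_scm_rights_fd(int sock)
{
	char data[256];
	char cbuf[CMSG_SPACE(sizeof(int) * 4)];	/* room for a few fds */
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	if (recvmsg(sock, &msg, 0) < 0)
		return;

	/* Kernel call sites now spell this walk as: for_each_cmsghdr(cmsg, &msg) { ... } */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
			int fd;

			/* Only the first descriptor is copied out, for brevity. */
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
			printf("received fd %d\n", fd);
			break;
		}
	}
}

In the kernel itself the macro is purely syntactic: as the include/linux/socket.h hunk above shows, it expands to exactly this three-part for statement, so the converted call sites in af_alg_cmsg_send(), __scm_send(), dccp_msghdr_parse() and the others behave identically to before.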