summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-03-01 23:54:31 +0300
committerDavid S. Miller <davem@davemloft.net>2016-03-01 23:54:31 +0300
commitd2e42a1756501ce3466e538da742f9a67c233d47 (patch)
tree2530b68faffa4a6cb301fb71c64cab0ee69f3747
parent46d5efa9a6abe2345472bff93a02480b308d3141 (diff)
parent163e529200af7b7521fbde5dbcc653cf3ce597df (diff)
downloadlinux-d2e42a1756501ce3466e538da742f9a67c233d47.tar.xz
Merge branch 'ndo_set_rx_headroom'
Paolo Abeni says: ==================== bridge/ovs: avoid skb head copy on frame forwarding Currently, while when an OVS or Linux bridge is used to forward frames towards some tunnel device, a skb_head_copy() may occur if the ingress device do not provide enough headroom for the tx encapsulation. This patch series tries to address the issue implementing a new ndo operation to allow the master device to control the headroom used when allocating the skb on frame reception. Said operation is used by the Linux bridge to notify the bridged ports of needed_headroom changes, and similar bookkeeping and behaviour is also added to openvswitch, on a per datapath basis. Finally, the operation is implemented for veth and tun device, which give performance improvement in the 6-12% range when forwarding frames from said devices towards a vxlan tunnel. v2: - fix netdev_get_fwd_headroom() behaviour - remove some code duplication with the netdev_set_rx_headroom() and netdev_reset_rx_headroom() helpers - handle headroom reset on [v]port removal/deletion - initialize tun align to the old default value v3: - fix a comment typo ==================== Acked-by: Pravin B Shelar <pshelar@ovn.org> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/tun.c17
-rw-r--r--drivers/net/veth.c26
-rw-r--r--include/linux/netdevice.h31
-rw-r--r--net/bridge/br_if.c37
-rw-r--r--net/openvswitch/datapath.c40
-rw-r--r--net/openvswitch/datapath.h4
-rw-r--r--net/openvswitch/vport-internal_dev.c10
7 files changed, 161 insertions, 4 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 88bb8cc3555b..afdf950617c3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -187,6 +187,7 @@ struct tun_struct {
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
NETIF_F_TSO6|NETIF_F_UFO)
+ int align;
int vnet_hdr_sz;
int sndbuf;
struct tap_filter txflt;
@@ -934,6 +935,17 @@ static void tun_poll_controller(struct net_device *dev)
return;
}
#endif
+
+static void tun_set_headroom(struct net_device *dev, int new_hr)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+
+ if (new_hr < NET_SKB_PAD)
+ new_hr = NET_SKB_PAD;
+
+ tun->align = new_hr;
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
@@ -945,6 +957,7 @@ static const struct net_device_ops tun_netdev_ops = {
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = tun_poll_controller,
#endif
+ .ndo_set_rx_headroom = tun_set_headroom,
};
static const struct net_device_ops tap_netdev_ops = {
@@ -962,6 +975,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_poll_controller = tun_poll_controller,
#endif
.ndo_features_check = passthru_features_check,
+ .ndo_set_rx_headroom = tun_set_headroom,
};
static void tun_flow_init(struct tun_struct *tun)
@@ -1086,7 +1100,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
struct sk_buff *skb;
size_t total_len = iov_iter_count(from);
- size_t len = total_len, align = NET_SKB_PAD, linear;
+ size_t len = total_len, align = tun->align, linear;
struct virtio_net_hdr gso = { 0 };
int good_linear;
int copylen;
@@ -1694,6 +1708,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->txflt.count = 0;
tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+ tun->align = NET_SKB_PAD;
tun->filter_attached = false;
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index ba21d072be31..4f30a6ae50d0 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -35,6 +35,7 @@ struct pcpu_vstats {
struct veth_priv {
struct net_device __rcu *peer;
atomic64_t dropped;
+ unsigned requested_headroom;
};
/*
@@ -271,6 +272,29 @@ static int veth_get_iflink(const struct net_device *dev)
return iflink;
}
+static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
+{
+ struct veth_priv *peer_priv, *priv = netdev_priv(dev);
+ struct net_device *peer;
+
+ if (new_hr < 0)
+ new_hr = 0;
+
+ rcu_read_lock();
+ peer = rcu_dereference(priv->peer);
+ if (unlikely(!peer))
+ goto out;
+
+ peer_priv = netdev_priv(peer);
+ priv->requested_headroom = new_hr;
+ new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
+ dev->needed_headroom = new_hr;
+ peer->needed_headroom = new_hr;
+
+out:
+ rcu_read_unlock();
+}
+
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
@@ -285,6 +309,7 @@ static const struct net_device_ops veth_netdev_ops = {
#endif
.ndo_get_iflink = veth_get_iflink,
.ndo_features_check = passthru_features_check,
+ .ndo_set_rx_headroom = veth_set_rx_headroom,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
@@ -301,6 +326,7 @@ static void veth_setup(struct net_device *dev)
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->priv_flags |= IFF_NO_QUEUE;
+ dev->priv_flags |= IFF_PHONY_HEADROOM;
dev->netdev_ops = &veth_netdev_ops;
dev->ethtool_ops = &veth_ethtool_ops;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e52077ffe5ed..efe7cec111fa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1093,6 +1093,12 @@ struct tc_to_netdev {
* This function is used to get egress tunnel information for given skb.
* This is useful for retrieving outer tunnel header parameters while
* sampling packet.
+ * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
+ * This function is used to specify the headroom that the skb must
+ * consider when allocation skb during packet reception. Setting
+ * appropriate rx headroom value allows avoiding skb head copy on
+ * forward. Setting a negative value reset the rx headroom to the
+ * default value.
*
*/
struct net_device_ops {
@@ -1278,6 +1284,8 @@ struct net_device_ops {
bool proto_down);
int (*ndo_fill_metadata_dst)(struct net_device *dev,
struct sk_buff *skb);
+ void (*ndo_set_rx_headroom)(struct net_device *dev,
+ int needed_headroom);
};
/**
@@ -1315,6 +1323,8 @@ struct net_device_ops {
* @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
* @IFF_TEAM: device is a team device
* @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
+ * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
+ * entity (i.e. the master device for bridged veth)
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
@@ -1343,6 +1353,7 @@ enum netdev_priv_flags {
IFF_L3MDEV_SLAVE = 1<<23,
IFF_TEAM = 1<<24,
IFF_RXFH_CONFIGURED = 1<<25,
+ IFF_PHONY_HEADROOM = 1<<26,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@@ -1937,6 +1948,26 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
struct sk_buff *skb,
void *accel_priv);
+/* returns the headroom that the master device needs to take in account
+ * when forwarding to this dev
+ */
+static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
+{
+ return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom;
+}
+
+static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
+{
+ if (dev->netdev_ops->ndo_set_rx_headroom)
+ dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
+}
+
+/* set the device rx headroom to the dev's default */
+static inline void netdev_reset_rx_headroom(struct net_device *dev)
+{
+ netdev_set_rx_headroom(dev, -1);
+}
+
/*
* Net namespace inlines
*/
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index b37a1cc97d98..a73df3315df9 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head)
destroy_nbp(p);
}
+static unsigned get_max_headroom(struct net_bridge *br)
+{
+ unsigned max_headroom = 0;
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);
+
+ if (dev_headroom > max_headroom)
+ max_headroom = dev_headroom;
+ }
+
+ return max_headroom;
+}
+
+static void update_headroom(struct net_bridge *br, int new_hr)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list)
+ netdev_set_rx_headroom(p->dev, new_hr);
+
+ br->dev->needed_headroom = new_hr;
+}
+
/* Delete port(interface) from bridge is done in two steps.
* via RCU. First step, marks device as down. That deletes
* all the timers and stops new packets from flowing through.
@@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p)
br_ifinfo_notify(RTM_DELLINK, p);
list_del_rcu(&p->list);
+ if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
+ update_headroom(br, get_max_headroom(br));
+ netdev_reset_rx_headroom(dev);
nbp_vlan_flush(p);
br_fdb_delete_by_port(br, p, 0, 1);
@@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
int err = 0;
+ unsigned br_hr, dev_hr;
bool changed_addr;
/* Don't allow bridging non-ethernet like devices, or DSA-enabled
@@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
netdev_update_features(br->dev);
- if (br->dev->needed_headroom < dev->needed_headroom)
- br->dev->needed_headroom = dev->needed_headroom;
+ br_hr = br->dev->needed_headroom;
+ dev_hr = netdev_get_fwd_headroom(dev);
+ if (br_hr < dev_hr)
+ update_headroom(br, dev_hr);
+ else
+ netdev_set_rx_headroom(dev, br_hr);
if (br_fdb_insert(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c4e8455d5d56..e6a7d494df24 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1908,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net,
return ERR_PTR(-EINVAL);
}
+/* Called with ovs_mutex */
+static void update_headroom(struct datapath *dp)
+{
+ unsigned dev_headroom, max_headroom = 0;
+ struct net_device *dev;
+ struct vport *vport;
+ int i;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
+ dev = vport->dev;
+ dev_headroom = netdev_get_fwd_headroom(dev);
+ if (dev_headroom > max_headroom)
+ max_headroom = dev_headroom;
+ }
+ }
+
+ dp->max_headroom = max_headroom;
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
+ netdev_set_rx_headroom(vport->dev, max_headroom);
+}
+
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
@@ -1973,6 +1996,12 @@ restart:
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+
+ if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
+ update_headroom(dp);
+ else
+ netdev_set_rx_headroom(vport->dev, dp->max_headroom);
+
BUG_ON(err < 0);
ovs_unlock();
@@ -2039,8 +2068,10 @@ exit_unlock_free:
static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
+ bool must_update_headroom = false;
struct nlattr **a = info->attrs;
struct sk_buff *reply;
+ struct datapath *dp;
struct vport *vport;
int err;
@@ -2062,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_DEL);
BUG_ON(err < 0);
+
+ /* the vport deletion may trigger dp headroom update */
+ dp = vport->dp;
+ if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
+ must_update_headroom = true;
+ netdev_reset_rx_headroom(vport->dev);
ovs_dp_detach_port(vport);
+
+ if (must_update_headroom)
+ update_headroom(dp);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, reply, info);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 67bdecd9fdc1..427e39a045cf 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -68,6 +68,8 @@ struct dp_stats_percpu {
* ovs_mutex and RCU.
* @stats_percpu: Per-CPU datapath statistics.
* @net: Reference to net namespace.
+ * @max_headroom: the maximum headroom of all vports in this datapath; it will
+ * be used by all the internal vports in this dp.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
@@ -89,6 +91,8 @@ struct datapath {
possible_net_t net;
u32 user_features;
+
+ u32 max_headroom;
};
/**
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index ec76398a792f..83a5534abd31 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
return stats;
}
+void internal_set_rx_headroom(struct net_device *dev, int new_hr)
+{
+ dev->needed_headroom = new_hr;
+}
+
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
@@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_get_stats,
+ .ndo_set_rx_headroom = internal_set_rx_headroom,
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev)
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
- netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
+ netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
+ IFF_PHONY_HEADROOM;
netdev->destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
err = -ENOMEM;
goto error_free_netdev;
}
+ vport->dev->needed_headroom = vport->dp->max_headroom;
dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
internal_dev = internal_dev_priv(vport->dev);