diff options
Diffstat (limited to 'net')
305 files changed, 8474 insertions, 5433 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index d2cd9de4b724..69929c05c843 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -261,7 +261,6 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; - new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT); vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ad5e2fd1012c..e416a4038a12 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -621,12 +621,12 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, return features; } -static int vlan_ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd) +static int vlan_ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); - return __ethtool_get_settings(vlan->real_dev, cmd); + return __ethtool_get_link_ksettings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, @@ -741,7 +741,7 @@ static int vlan_dev_get_iflink(const struct net_device *dev) } static const struct ethtool_ops vlan_ethtool_ops = { - .get_settings = vlan_ethtool_get_settings, + .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_ts_info = vlan_ethtool_get_ts_info, @@ -799,6 +799,7 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; + dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index ae63cf72a953..5f1446c9f098 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -184,12 +184,11 @@ int vlan_proc_add_dev(struct net_device *vlandev) /* * Delete directory entry for VLAN device. */ -int vlan_proc_rem_dev(struct net_device *vlandev) +void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. */ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; - return 0; } /****** Proc filesystem entry points ****************************************/ diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h index 063f60a3d5cc..8838a2e92eb6 100644 --- a/net/8021q/vlanproc.h +++ b/net/8021q/vlanproc.h @@ -5,7 +5,7 @@ struct net; int vlan_proc_init(struct net *net); -int vlan_proc_rem_dev(struct net_device *vlandev); +void vlan_proc_rem_dev(struct net_device *vlandev); int vlan_proc_add_dev(struct net_device *vlandev); void vlan_proc_cleanup(struct net *net); @@ -14,7 +14,7 @@ void vlan_proc_cleanup(struct net *net); #define vlan_proc_init(net) (0) #define vlan_proc_cleanup(net) do {} while (0) #define vlan_proc_add_dev(dev) ({(void)(dev), 0; }) -#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; }) +#define vlan_proc_rem_dev(dev) do {} while (0) #endif #endif /* !(__BEN_VLAN_PROC_INC__) */ diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bced8c074c12..7bc2208b6cc4 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -108,9 +108,7 @@ struct p9_poll_wait { * @unsent_req_list: accounting for requests that haven't been sent * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header - * @rsize: amount to read for current frame - * @rpos: read position in current frame - * @rbuf: current read buffer + * @rc: temporary fcall for reading current frame * @wpos: write position for current frame * @wsize: amount of data to write for current frame * @wbuf: current write buffer @@ -131,9 +129,7 @@ struct p9_conn { struct list_head unsent_req_list; struct p9_req_t *req; char tmp_buf[7]; - int rsize; - int rpos; - char *rbuf; + struct p9_fcall rc; int wpos; int wsize; char *wbuf; @@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work) if (m->err < 0) return; - p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); + p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset); - if (!m->rbuf) { - m->rbuf = m->tmp_buf; - m->rpos = 0; - m->rsize = 7; /* start by reading header */ + if (!m->rc.sdata) { + m->rc.sdata = m->tmp_buf; + m->rc.offset = 0; + m->rc.capacity = 7; /* start by reading header */ } clear_bit(Rpending, &m->wsched); - p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", - m, m->rpos, m->rsize, m->rsize-m->rpos); - err = p9_fd_read(m->client, m->rbuf + m->rpos, - m->rsize - m->rpos); + p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n", + m, m->rc.offset, m->rc.capacity, + m->rc.capacity - m->rc.offset); + err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset, + m->rc.capacity - m->rc.offset); p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); - if (err == -EAGAIN) { + if (err == -EAGAIN) goto end_clear; - } if (err <= 0) goto error; - m->rpos += err; + m->rc.offset += err; - if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */ - u16 tag; + /* header read in */ + if ((!m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new header\n"); - n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */ - if (n >= m->client->msize) { + err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0); + if (err) { + p9_debug(P9_DEBUG_ERROR, + "error parsing header: %d\n", err); + goto error; + } + + if (m->rc.size >= m->client->msize) { p9_debug(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", n); + "requested packet size too big: %d\n", + m->rc.size); err = -EIO; goto error; } - tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */ p9_debug(P9_DEBUG_TRANS, - "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); + "mux %p pkt: size: %d bytes tag: %d\n", + m, m->rc.size, m->rc.tag); - m->req = p9_tag_lookup(m->client, tag); + m->req = p9_tag_lookup(m->client, m->rc.tag); if (!m->req || (m->req->status != REQ_STATUS_SENT)) { p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", - tag); + m->rc.tag); err = -EIO; goto error; } if (m->req->rc == NULL) { - m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_NOFS); - if (!m->req->rc) { - m->req = NULL; - err = -ENOMEM; - goto error; - } + p9_debug(P9_DEBUG_ERROR, + "No recv fcall for tag %d (req %p), disconnecting!\n", + m->rc.tag, m->req); + m->req = NULL; + err = -EIO; + goto error; } - m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall); - memcpy(m->rbuf, m->tmp_buf, m->rsize); - m->rsize = n; + m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall); + memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); + m->rc.capacity = m->rc.size; } - /* not an else because some packets (like clunk) have no payload */ - if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ + /* packet is read in + * not an else because some packets (like clunk) have no payload + */ + if ((m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); if (m->req->status != REQ_STATUS_ERROR) @@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work) list_del(&m->req->req_list); spin_unlock(&m->client->lock); p9_client_cb(m->client, m->req, status); - m->rbuf = NULL; - m->rpos = 0; - m->rsize = 0; + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; m->req = NULL; } diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 199bc76202d2..4acb1d5417aa 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -658,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) mutex_unlock(&virtio_9p_lock); if (!found) { - pr_err("no channels available\n"); + pr_err("no channels available for device %s\n", devname); return ret; } diff --git a/net/Kconfig b/net/Kconfig index 174354618f8a..6c9cfb0d7639 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -392,6 +392,17 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config DST_CACHE + bool "dst cache" + default n + +config NET_DEVLINK + tristate "Network physical/parent device Netlink interface" + help + Network physical/parent device Netlink interface provides + infrastructure to support access to physical chip-wide config and + monitoring. + endif # if NET # Used by archs to tell that they support BPF_JIT diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index d5871ac493eb..f066781be3c8 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1625,7 +1625,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) rt = atrtr_find(&at_hint); } - err = ENETUNREACH; + err = -ENETUNREACH; if (!rt) goto out; diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c6fc8f756c9a..2dd40e5ea030 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -12,7 +12,7 @@ config BATMAN_ADV B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is a routing protocol for multi-hop ad-hoc mesh networks. The networks may be wired or wireless. See - http://www.open-mesh.org/ for more information and user space + https://www.open-mesh.org/ for more information and user space tools. config BATMAN_ADV_BLA diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 21434ab79d2c..207e2af316c7 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 4e59cf3eb079..a7485d676088 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index df625de55ef2..5651e33ca6bd 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -31,6 +31,7 @@ #include <linux/init.h> #include <linux/jiffies.h> #include <linux/list.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/printk.h> @@ -88,7 +89,7 @@ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) * in the given ring buffer * @lq_recv: pointer to the ring buffer * - * Returns computed average value. + * Return: computed average value. */ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { @@ -132,7 +133,7 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node) * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * - * Returns 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, int max_if_num) @@ -180,7 +181,7 @@ unlock: * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed * - * Returns 0 on success, a negative error code otherwise. + * Return: 0 on success, a negative error code otherwise. */ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, int max_if_num, int del_if_num) @@ -246,7 +247,7 @@ unlock: * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of the originator * - * Returns the originator object corresponding to the passed mac address or NULL + * Return: the originator object corresponding to the passed mac address or NULL * on failure. * If the object does not exists it is created an initialised. */ @@ -286,8 +287,8 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) free_orig_node: /* free twice, as batadv_orig_node_new sets refcount to 2 */ - batadv_orig_node_free_ref(orig_node); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return NULL; } @@ -396,7 +397,14 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) return new_tq; } -/* is there another aggregated packet here? */ +/** + * batadv_iv_ogm_aggr_packet - checks if there is another OGM attached + * @buff_pos: current position in the skb + * @packet_len: total length of the skb + * @tvlv_len: tvlv length of the previously considered OGM + * + * Return: true if there is enough space for another OGM, false otherwise. + */ static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, __be16 tvlv_len) { @@ -507,7 +515,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -522,7 +530,7 @@ out: * @if_outgoing: interface for which the retransmission should be considered * @forw_packet: the forwarded packet which should be checked * - * Returns true if new_packet can be aggregated with forw_packet + * Return: true if new_packet can be aggregated with forw_packet */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, @@ -609,7 +617,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return res; } @@ -636,10 +644,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, unsigned char *skb_buff; unsigned int skb_size; - if (!atomic_inc_not_zero(&if_incoming->refcount)) + if (!kref_get_unless_zero(&if_incoming->refcount)) return; - if (!atomic_inc_not_zero(&if_outgoing->refcount)) + if (!kref_get_unless_zero(&if_outgoing->refcount)) goto out_free_incoming; /* own packet should always be scheduled */ @@ -703,9 +711,9 @@ out_nomem: if (!own_packet) atomic_inc(&bat_priv->batman_queue_left); out_free_outgoing: - batadv_hardif_free_ref(if_outgoing); + batadv_hardif_put(if_outgoing); out_free_incoming: - batadv_hardif_free_ref(if_incoming); + batadv_hardif_put(if_incoming); } /* aggregate a new packet into the existing ogm packet */ @@ -950,7 +958,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -995,9 +1003,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && - atomic_inc_not_zero(&tmp_neigh_node->refcount)) { + kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); neigh_node = tmp_neigh_node; continue; } @@ -1018,7 +1026,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); neigh_ifinfo = NULL; } @@ -1033,7 +1041,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, ethhdr->h_source, orig_node, orig_tmp); - batadv_orig_node_free_ref(orig_tmp); + batadv_orig_node_put(orig_tmp); if (!neigh_node) goto unlock; } else { @@ -1108,13 +1116,13 @@ unlock: rcu_read_unlock(); out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (neigh_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } /** @@ -1125,7 +1133,7 @@ out: * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Returns 1 if the link can be considered bidirectional, 0 otherwise + * Return: 1 if the link can be considered bidirectional, 0 otherwise */ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, @@ -1154,7 +1162,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, if (tmp_neigh_node->if_incoming != if_incoming) continue; - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; neigh_node = tmp_neigh_node; @@ -1184,7 +1192,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } else { neigh_rq_count = 0; } @@ -1257,7 +1265,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -1269,7 +1277,7 @@ out: * @if_incoming: interface on which the OGM packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Returns duplicate status as enum batadv_dup_status + * Return: duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, @@ -1298,7 +1306,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (WARN_ON(!orig_ifinfo)) { - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return 0; } @@ -1308,7 +1316,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, /* signalize caller that the packet is to be dropped. */ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, - &orig_ifinfo->batman_seqno_reset)) { + BATADV_TQ_LOCAL_WINDOW_SIZE, + &orig_ifinfo->batman_seqno_reset, NULL)) { ret = BATADV_PROTECTED; goto out; } @@ -1344,7 +1353,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); neigh_ifinfo->bat_iv.real_packet_count = packet_count; - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } rcu_read_unlock(); @@ -1358,8 +1367,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - batadv_orig_node_free_ref(orig_node); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_node_put(orig_node); + batadv_orig_ifinfo_put(orig_ifinfo); return ret; } @@ -1505,7 +1514,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, ogm_packet, if_incoming, if_outgoing, dup_status); } - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); /* only forward for specific interface, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) @@ -1554,18 +1563,18 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, out_neigh: if ((orig_neigh_node) && (!is_single_hop_neigh)) - batadv_orig_node_free_ref(orig_neigh_node); + batadv_orig_node_put(orig_neigh_node); out: if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_router) - batadv_neigh_node_free_ref(router_router); + batadv_neigh_node_put(router_router); if (orig_neigh_router) - batadv_neigh_node_free_ref(orig_neigh_router); + batadv_neigh_node_put(orig_neigh_router); if (hardif_neigh) - batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); kfree_skb(skb_priv); } @@ -1688,7 +1697,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); - batadv_orig_node_free_ref(orig_neigh_node); + batadv_orig_node_put(orig_neigh_node); return; } @@ -1726,7 +1735,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, } rcu_read_unlock(); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static int batadv_iv_ogm_receive(struct sk_buff *skb, @@ -1796,7 +1805,7 @@ batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node, neigh_node->addr, n_ifinfo->bat_iv.tq_avg); - batadv_neigh_ifinfo_free_ref(n_ifinfo); + batadv_neigh_ifinfo_put(n_ifinfo); } } @@ -1859,9 +1868,9 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, batman_count++; next: - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (n_ifinfo) - batadv_neigh_ifinfo_free_ref(n_ifinfo); + batadv_neigh_ifinfo_put(n_ifinfo); } rcu_read_unlock(); } @@ -1929,7 +1938,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * - * Returns a value less, equal to or greater than 0 if the metric via neigh1 is + * Return: a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, @@ -1955,9 +1964,9 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, out: if (neigh1_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); if (neigh2_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return diff; } @@ -1970,7 +1979,7 @@ out: * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * - * Returns true if the metric via neigh1 is equally good or better than + * Return: true if the metric via neigh1 is equally good or better than * the metric via neigh2, false otherwise. */ static bool @@ -1998,9 +2007,9 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, out: if (neigh1_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); if (neigh2_ifinfo) - batadv_neigh_ifinfo_free_ref(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 25cbc36e997a..b56bb000a0ab 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -29,10 +29,16 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) bitmap_shift_left(seq_bits, seq_bits, n, BATADV_TQ_LOCAL_WINDOW_SIZE); } -/* receive and process one packet within the sequence number window. +/** + * batadv_bit_get_packet - receive and process one packet within the sequence + * number window + * @priv: the bat priv with all the soft interface information + * @seq_bits: pointer to the sequence number receive packet + * @seq_num_diff: difference between the current/received sequence number and + * the last sequence number + * @set_mark: whether this packet should be marked in seq_bits * - * returns: - * 1 if the window was moved (either new or very old) + * Return: 1 if the window was moved (either new or very old), * 0 if the window was not moved/shifted. */ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 0226b220fe5b..3e41bb80eb81 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -24,7 +24,14 @@ #include <linux/compiler.h> #include <linux/types.h> -/* Returns 1 if the corresponding bit in the given seq_bits indicates true +/** + * batadv_test_bit - check if bit is set in the current window + * + * @seq_bits: pointer to the sequence number receive packet + * @last_seqno: latest sequence number in seq_bits + * @curr_seqno: sequence number to test for + * + * Return: 1 if the corresponding bit in the given seq_bits indicates true * and curr_seqno is within range of last_seqno. Otherwise returns 0. */ static inline int batadv_test_bit(const unsigned long *seq_bits, @@ -48,9 +55,6 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) set_bit(n, seq_bits); /* turn the position on */ } -/* receive and process one packet, returns 1 if received seq_num is considered - * new, 0 if old - */ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, int set_mark); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index d5d71ac96c8a..0a6c8b824a00 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -31,6 +31,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -58,7 +59,13 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); -/* return the index of the claim */ +/** + * batadv_choose_claim - choose the right bucket for a claim. + * @data: data to hash + * @size: size of the hash table + * + * Return: the hash index of the claim + */ static inline u32 batadv_choose_claim(const void *data, u32 size) { struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; @@ -70,7 +77,13 @@ static inline u32 batadv_choose_claim(const void *data, u32 size) return hash % size; } -/* return the index of the backbone gateway */ +/** + * batadv_choose_backbone_gw - choose the right bucket for a backbone gateway. + * @data: data to hash + * @size: size of the hash table + * + * Return: the hash index of the backbone gateway + */ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; @@ -82,7 +95,13 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) return hash % size; } -/* compares address and vid of two backbone gws */ +/** + * batadv_compare_backbone_gw - compare address and vid of two backbone gws + * @node: list node of the first entry to compare + * @data2: pointer to the second backbone gateway + * + * Return: 1 if the backbones have the same data, 0 otherwise + */ static int batadv_compare_backbone_gw(const struct hlist_node *node, const void *data2) { @@ -100,7 +119,13 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, return 1; } -/* compares address and vid of two claims */ +/** + * batadv_compare_backbone_gw - compare address and vid of two claims + * @node: list node of the first entry to compare + * @data2: pointer to the second claims + * + * Return: 1 if the claim have the same data, 0 otherwise + */ static int batadv_compare_claim(const struct hlist_node *node, const void *data2) { @@ -118,39 +143,62 @@ static int batadv_compare_claim(const struct hlist_node *node, return 1; } -/* free a backbone gw */ -static void -batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) +/** + * batadv_backbone_gw_release - release backbone gw from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the backbone gw + */ +static void batadv_backbone_gw_release(struct kref *ref) +{ + struct batadv_bla_backbone_gw *backbone_gw; + + backbone_gw = container_of(ref, struct batadv_bla_backbone_gw, + refcount); + + kfree_rcu(backbone_gw, rcu); +} + +/** + * batadv_backbone_gw_put - decrement the backbone gw refcounter and possibly + * release it + * @backbone_gw: backbone gateway to be free'd + */ +static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) { - if (atomic_dec_and_test(&backbone_gw->refcount)) - kfree_rcu(backbone_gw, rcu); + kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); } -/* finally deinitialize the claim */ -static void batadv_claim_free_rcu(struct rcu_head *rcu) +/** + * batadv_claim_release - release claim from lists and queue for free after rcu + * grace period + * @ref: kref pointer of the claim + */ +static void batadv_claim_release(struct kref *ref) { struct batadv_bla_claim *claim; - claim = container_of(rcu, struct batadv_bla_claim, rcu); + claim = container_of(ref, struct batadv_bla_claim, refcount); - batadv_backbone_gw_free_ref(claim->backbone_gw); - kfree(claim); + batadv_backbone_gw_put(claim->backbone_gw); + kfree_rcu(claim, rcu); } -/* free a claim, call claim_free_rcu if its the last reference */ -static void batadv_claim_free_ref(struct batadv_bla_claim *claim) +/** + * batadv_claim_put - decrement the claim refcounter and possibly + * release it + * @claim: claim to be free'd + */ +static void batadv_claim_put(struct batadv_bla_claim *claim) { - if (atomic_dec_and_test(&claim->refcount)) - call_rcu(&claim->rcu, batadv_claim_free_rcu); + kref_put(&claim->refcount, batadv_claim_release); } /** - * batadv_claim_hash_find + * batadv_claim_hash_find - looks for a claim in the claim hash * @bat_priv: the bat priv with all the soft interface information * @data: search data (may be local/static data) * - * looks for a claim in the hash, and returns it if found - * or NULL otherwise. + * Return: claim if found or NULL otherwise. */ static struct batadv_bla_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv, @@ -173,7 +221,7 @@ static struct batadv_bla_claim if (!batadv_compare_claim(&claim->hash_entry, data)) continue; - if (!atomic_inc_not_zero(&claim->refcount)) + if (!kref_get_unless_zero(&claim->refcount)) continue; claim_tmp = claim; @@ -185,12 +233,12 @@ static struct batadv_bla_claim } /** - * batadv_backbone_hash_find - looks for a claim in the hash + * batadv_backbone_hash_find - looks for a backbone gateway in the hash * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * @vid: the VLAN ID * - * Returns claim if found or NULL otherwise. + * Return: backbone gateway if found or NULL otherwise */ static struct batadv_bla_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, @@ -217,7 +265,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, &search_entry)) continue; - if (!atomic_inc_not_zero(&backbone_gw->refcount)) + if (!kref_get_unless_zero(&backbone_gw->refcount)) continue; backbone_gw_tmp = backbone_gw; @@ -228,7 +276,10 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, return backbone_gw_tmp; } -/* delete all claims for a backbone */ +/** + * batadv_bla_del_backbone_claims - delete all claims for a backbone + * @backbone_gw: backbone gateway where the claims should be removed + */ static void batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) { @@ -253,7 +304,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) if (claim->backbone_gw != backbone_gw) continue; - batadv_claim_free_ref(claim); + batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); } spin_unlock_bh(list_lock); @@ -372,18 +423,17 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, netif_rx(skb); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** - * batadv_bla_get_backbone_gw + * batadv_bla_get_backbone_gw - finds or creates a backbone gateway * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @own_backbone: set if the requested backbone is local * - * searches for the backbone gw or creates a new one if it could not - * be found. + * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, @@ -416,7 +466,8 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, ether_addr_copy(entry->orig, orig); /* one for the hash, one for returning */ - atomic_set(&entry->refcount, 2); + kref_init(&entry->refcount); + kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, @@ -434,7 +485,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, if (orig_node) { batadv_tt_global_del_orig(bat_priv, orig_node, vid, "became a backbone gateway"); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } if (own_backbone) { @@ -449,7 +500,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, return entry; } -/* update or add the own backbone gw to make sure we announce +/** + * batadv_bla_update_own_backbone_gw - updates the own backbone gw for a VLAN + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the selected primary interface + * @vid: VLAN identifier + * + * update or add the own backbone gw to make sure we announce * where we receive other backbone gws */ static void @@ -466,7 +523,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, return; backbone_gw->lasttime = jiffies; - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } /** @@ -515,7 +572,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, /* finally, send an announcement frame */ batadv_bla_send_announce(bat_priv, backbone_gw); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } /** @@ -546,12 +603,9 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) } /** - * batadv_bla_send_announce + * batadv_bla_send_announce - Send an announcement frame * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: our backbone gateway which should be announced - * - * This function sends an announcement. It is called from multiple - * places. */ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) @@ -599,7 +653,8 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, claim->lasttime = jiffies; claim->backbone_gw = backbone_gw; - atomic_set(&claim->refcount, 2); + kref_init(&claim->refcount); + kref_get(&claim->refcount); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", mac, BATADV_PRINT_VID(vid)); @@ -626,10 +681,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&claim->backbone_gw->crc_lock); - batadv_backbone_gw_free_ref(claim->backbone_gw); + batadv_backbone_gw_put(claim->backbone_gw); } /* set (new) backbone gw */ - atomic_inc(&backbone_gw->refcount); + kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; spin_lock_bh(&backbone_gw->crc_lock); @@ -638,11 +693,14 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, backbone_gw->lasttime = jiffies; claim_free_ref: - batadv_claim_free_ref(claim); + batadv_claim_put(claim); } -/* Delete a claim from the claim hash which has the - * given mac address and vid. +/** + * batadv_bla_del_claim - delete a claim from the claim hash + * @bat_priv: the bat priv with all the soft interface information + * @mac: mac address of the claim to be removed + * @vid: VLAN id for the claim to be removed */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid) @@ -660,17 +718,25 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); - batadv_claim_free_ref(claim); /* reference from the hash is gone */ + batadv_claim_put(claim); /* reference from the hash is gone */ spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&claim->backbone_gw->crc_lock); /* don't need the reference from hash_find() anymore */ - batadv_claim_free_ref(claim); + batadv_claim_put(claim); } -/* check for ANNOUNCE frame, return 1 if handled */ +/** + * batadv_handle_announce - check for ANNOUNCE frame + * @bat_priv: the bat priv with all the soft interface information + * @an_addr: announcement mac address (ARP Sender HW address) + * @backbone_addr: originator address of the sender (Ethernet source MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { @@ -716,11 +782,20 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, } } - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } -/* check for REQUEST frame, return 1 if handled */ +/** + * batadv_handle_request - check for REQUEST frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: backbone address to be requested (ARP sender HW MAC) + * @ethhdr: ethernet header of a packet + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, struct ethhdr *ethhdr, @@ -744,7 +819,16 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, return 1; } -/* check for UNCLAIM frame, return 1 if handled */ +/** + * batadv_handle_unclaim - check for UNCLAIM frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: originator address of the backbone (Ethernet source) + * @claim_addr: Client to be unclaimed (ARP sender HW MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, u8 *claim_addr, @@ -769,11 +853,20 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } -/* check for CLAIM frame, return 1 if handled */ +/** + * batadv_handle_claim - check for CLAIM frame + * @bat_priv: the bat priv with all the soft interface information + * @primary_if: the primary hard interface of this batman soft interface + * @backbone_addr: originator address of the backbone (Ethernet Source) + * @claim_addr: client mac address to be claimed (ARP sender HW MAC) + * @vid: the VLAN ID of the frame + * + * Return: 1 if handled + */ static int batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, u8 *claim_addr, @@ -797,12 +890,12 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, /* TODO: we could call something like tt_local_del() here. */ - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } /** - * batadv_check_claim_group + * batadv_check_claim_group - check for claim group membership * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header @@ -813,7 +906,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, * This function also applies the group ID of the sender * if it is in the same mesh. * - * returns: + * Return: * 2 - if it is a claim packet and on the same group * 1 - if is a claim packet from another group * 0 - if it is not a claim packet @@ -871,20 +964,18 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, bla_dst_own->group = bla_dst->group; } - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return 2; } /** - * batadv_bla_process_claim + * batadv_bla_process_claim - Check if this is a claim frame, and process it * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @skb: the frame to be checked * - * Check if this is a claim frame, and process it accordingly. - * - * returns 1 if it was a claim frame, otherwise return 0 to + * Return: 1 if it was a claim frame, otherwise return 0 to * tell the callee that it can use the frame on its own. */ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, @@ -1015,7 +1106,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, return 1; } -/* Check when we last heard from other nodes, and remove them in case of +/** + * batadv_bla_purge_backbone_gw - Remove backbone gateways after a timeout or + * immediately + * @bat_priv: the bat priv with all the soft interface information + * @now: whether the whole hash shall be wiped now + * + * Check when we last heard from other nodes, and remove them in case of * a time out, or clean all backbone gws if now is set. */ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) @@ -1056,14 +1153,14 @@ purge_now: batadv_bla_del_backbone_claims(backbone_gw); hlist_del_rcu(&backbone_gw->hash_entry); - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); } spin_unlock_bh(list_lock); } } /** - * batadv_bla_purge_claims + * batadv_bla_purge_claims - Remove claims after a timeout or immediately * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now @@ -1112,12 +1209,11 @@ purge_now: } /** - * batadv_bla_update_orig_address + * batadv_bla_update_orig_address - Update the backbone gateways when the own + * originator address changes * @bat_priv: the bat priv with all the soft interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL - * - * Update the backbone gateways when the own orig address changes. */ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, @@ -1185,10 +1281,14 @@ void batadv_bla_status_update(struct net_device *net_dev) * so just call that one. */ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } -/* periodic work to do: +/** + * batadv_bla_periodic_work - performs periodic bla work + * @work: kernel work struct + * + * periodic work to do: * * purge structures when they are too old * * send announcements */ @@ -1255,7 +1355,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); @@ -1269,7 +1369,12 @@ out: static struct lock_class_key batadv_claim_hash_lock_class_key; static struct lock_class_key batadv_backbone_hash_lock_class_key; -/* initialize all bla structures */ +/** + * batadv_bla_init - initialize all bla structures + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success, < 0 on error. + */ int batadv_bla_init(struct batadv_priv *bat_priv) { int i; @@ -1289,7 +1394,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) if (primary_if) { crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN); bat_priv->bla.claim_dest.group = htons(crc); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } else { bat_priv->bla.claim_dest.group = 0; /* will be set later */ } @@ -1324,7 +1429,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) } /** - * batadv_bla_check_bcast_duplist + * batadv_bla_check_bcast_duplist - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the soft interface information * @skb: contains the bcast_packet to be checked * @@ -1336,6 +1441,8 @@ int batadv_bla_init(struct batadv_priv *bat_priv) * with a good chance that it is the same packet. If it is furthermore * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. + * + * Return: 1 if a packet is in the duplicate list, 0 otherwise. */ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -1394,14 +1501,13 @@ out: } /** - * batadv_bla_is_backbone_gw_orig + * batadv_bla_is_backbone_gw_orig - Check if the originator is a gateway for + * the VLAN identified by vid. * @bat_priv: the bat priv with all the soft interface information * @orig: originator mac address * @vid: VLAN identifier * - * Check if the originator is a gateway for the VLAN identified by vid. - * - * Returns true if orig is a backbone for this vid, false otherwise. + * Return: true if orig is a backbone for this vid, false otherwise. */ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) @@ -1435,14 +1541,13 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, } /** - * batadv_bla_is_backbone_gw + * batadv_bla_is_backbone_gw - check if originator is a backbone gw for a VLAN. * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * - * bla_is_backbone_gw inspects the skb for the VLAN ID and returns 1 - * if the orig_node is also a gateway on the soft interface, otherwise it - * returns 0. + * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise + * it returns 0. */ int batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) @@ -1465,11 +1570,16 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, if (!backbone_gw) return 0; - batadv_backbone_gw_free_ref(backbone_gw); + batadv_backbone_gw_put(backbone_gw); return 1; } -/* free all bla structures (for softinterface free or module unload) */ +/** + * batadv_bla_init - free all bla structures + * @bat_priv: the bat priv with all the soft interface information + * + * for softinterface free or module unload + */ void batadv_bla_free(struct batadv_priv *bat_priv) { struct batadv_hard_iface *primary_if; @@ -1488,22 +1598,23 @@ void batadv_bla_free(struct batadv_priv *bat_priv) bat_priv->bla.backbone_hash = NULL; } if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** - * batadv_bla_rx + * batadv_bla_rx - check packets coming from the mesh. * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @is_bcast: the packet came in a broadcast packet type. * - * bla_rx avoidance checks if: + * batadv_bla_rx avoidance checks if: * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function and - * returns 1, otherwise it returns 0 and the caller shall further + * in these cases, the skb is further handled by this function + * + * Return: 1 if handled, otherwise it returns 0 and the caller shall further * process the skb. */ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -1580,27 +1691,28 @@ handled: out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (claim) - batadv_claim_free_ref(claim); + batadv_claim_put(claim); return ret; } /** - * batadv_bla_tx + * batadv_bla_tx - check packets going into the mesh * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * - * bla_tx checks if: + * batadv_bla_tx checks if: * * a claim was received which has to be processed * * the frame is allowed on the mesh * - * in these cases, the skb is further handled by this function and - * returns 1, otherwise it returns 0 and the caller shall further - * process the skb. + * in these cases, the skb is further handled by this function. * * This call might reallocate skb data. + * + * Return: 1 if handled, otherwise it returns 0 and the caller shall further + * process the skb. */ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) @@ -1668,12 +1780,19 @@ handled: ret = 1; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (claim) - batadv_claim_free_ref(claim); + batadv_claim_put(claim); return ret; } +/** + * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1719,10 +1838,18 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } +/** + * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq + * file + * @seq: seq file to print on + * @offset: not used + * + * Return: always 0 + */ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1776,6 +1903,6 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 7ea199b8b5ab..579f0fa6fe6a 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 037ad0a5f485..48253cf8341b 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -281,6 +281,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file) * originator table of an hard interface * @inode: inode pointer to debugfs file * @file: pointer to the seq_file + * + * Return: 0 on success or negative error number in case of failure */ static int batadv_originators_hardif_open(struct inode *inode, struct file *file) @@ -329,6 +331,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode, * batadv_dat_cache_open - Prepare file handler for reads from dat_chache * @inode: inode which was opened * @file: file handle to be initialized + * + * Return: 0 on success or negative error number in case of failure */ static int batadv_dat_cache_open(struct inode *inode, struct file *file) { @@ -483,6 +487,8 @@ void batadv_debugfs_destroy(void) * batadv_debugfs_add_hardif - creates the base directory for a hard interface * in debugfs. * @hard_iface: hard interface which should be added. + * + * Return: 0 on success or negative error number in case of failure */ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 80ab8d6f0ab3..1ab4e2e63afc 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index a49c705fb86b..4c9b69d465a6 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -30,6 +30,7 @@ #include <linux/in.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -62,21 +63,34 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly - * free it - * @dat_entry: the entry to free + * batadv_dat_entry_release - release dat_entry from lists and queue for free + * after rcu grace period + * @ref: kref pointer of the dat_entry */ -static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry) +static void batadv_dat_entry_release(struct kref *ref) { - if (atomic_dec_and_test(&dat_entry->refcount)) - kfree_rcu(dat_entry, rcu); + struct batadv_dat_entry *dat_entry; + + dat_entry = container_of(ref, struct batadv_dat_entry, refcount); + + kfree_rcu(dat_entry, rcu); +} + +/** + * batadv_dat_entry_put - decrement the dat_entry refcounter and possibly + * release it + * @dat_entry: dat_entry to be free'd + */ +static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry) +{ + kref_put(&dat_entry->refcount, batadv_dat_entry_release); } /** * batadv_dat_to_purge - check whether a dat_entry has to be purged or not * @dat_entry: the entry to check * - * Returns true if the entry has to be purged now, false otherwise. + * Return: true if the entry has to be purged now, false otherwise. */ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) { @@ -121,7 +135,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, continue; hlist_del_rcu(&dat_entry->hash_entry); - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); } spin_unlock_bh(list_lock); } @@ -151,7 +165,7 @@ static void batadv_dat_purge(struct work_struct *work) * @node: node in the local table * @data2: second object to compare the node to * - * Returns 1 if the two entries are the same, 0 otherwise. + * Return: 1 if the two entries are the same, 0 otherwise. */ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) { @@ -166,7 +180,7 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_src field in the ARP packet. + * Return: the value of the hw_src field in the ARP packet. */ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) { @@ -183,7 +197,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_src field in the ARP packet. + * Return: the value of the ip_src field in the ARP packet. */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { @@ -195,7 +209,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_dst field in the ARP packet. + * Return: the value of the hw_dst field in the ARP packet. */ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) { @@ -207,7 +221,7 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_dst field in the ARP packet. + * Return: the value of the ip_dst field in the ARP packet. */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { @@ -219,7 +233,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) * @data: data to hash * @size: size of the hash table * - * Returns the selected index in the hash table for the given data. + * Return: the selected index in the hash table for the given data. */ static u32 batadv_hash_dat(const void *data, u32 size) { @@ -256,7 +270,7 @@ static u32 batadv_hash_dat(const void *data, u32 size) * @ip: search key * @vid: VLAN identifier * - * Returns the dat_entry if found, NULL otherwise. + * Return: the dat_entry if found, NULL otherwise. */ static struct batadv_dat_entry * batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, @@ -281,7 +295,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, if (dat_entry->ip != ip) continue; - if (!atomic_inc_not_zero(&dat_entry->refcount)) + if (!kref_get_unless_zero(&dat_entry->refcount)) continue; dat_entry_tmp = dat_entry; @@ -326,7 +340,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, dat_entry->vid = vid; ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; - atomic_set(&dat_entry->refcount, 2); + kref_init(&dat_entry->refcount); + kref_get(&dat_entry->refcount); hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat, batadv_hash_dat, dat_entry, @@ -334,7 +349,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); goto out; } @@ -343,7 +358,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); } #ifdef CONFIG_BATMAN_ADV_DEBUG @@ -440,7 +455,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, * @candidate: orig_node under evaluation * @max_orig_node: last selected candidate * - * Returns true if the node has been elected as next candidate or false + * Return: true if the node has been elected as next candidate or false * otherwise. */ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, @@ -527,12 +542,12 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, max_orig_node)) continue; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) continue; max = tmp_max; if (max_orig_node) - batadv_orig_node_free_ref(max_orig_node); + batadv_orig_node_put(max_orig_node); max_orig_node = orig_node; } rcu_read_unlock(); @@ -558,7 +573,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * closest values (from the LEFT, with wrap around if needed) then the hash * value of the key. ip_dst is the key. * - * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM. + * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM. */ static struct batadv_dat_candidate * batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) @@ -602,7 +617,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) * This function copies the skb with pskb_copy() and is sent as unicast packet * to each of the selected candidates. * - * Returns true if the packet is sent to at least one candidate, false + * Return: true if the packet is sent to at least one candidate, false * otherwise. */ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, @@ -659,9 +674,9 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, ret = true; } free_neigh: - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); free_orig: - batadv_orig_node_free_ref(cand[i].orig_node); + batadv_orig_node_put(cand[i].orig_node); } out: @@ -741,6 +756,8 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) /** * batadv_dat_init - initialise the DAT internals * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 in case of success, a negative error code otherwise */ int batadv_dat_init(struct batadv_priv *bat_priv) { @@ -779,6 +796,8 @@ void batadv_dat_free(struct batadv_priv *bat_priv) * batadv_dat_cache_seq_print_text - print the local DAT hash table * @seq: seq file to print on * @offset: not used + * + * Return: always 0 */ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) { @@ -821,7 +840,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -831,7 +850,7 @@ out: * @skb: packet to analyse * @hdr_size: size of the possible header before the ARP packet in the skb * - * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise. + * Return: the ARP type if the skb contains a valid ARP packet, 0 otherwise. */ static u16 batadv_arp_get_type(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -904,8 +923,9 @@ out: * @skb: the buffer containing the packet to extract the VID from * @hdr_size: the size of the batman-adv header encapsulating the packet * - * If the packet embedded in the skb is vlan tagged this function returns the - * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned. + * Return: If the packet embedded in the skb is vlan tagged this function + * returns the VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS + * is returned. */ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) { @@ -930,7 +950,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check * - * Returns true if the message has been sent to the dht candidates, false + * Return: true if the message has been sent to the dht candidates, false * otherwise. In case of a positive return value the message has to be enqueued * to permit the fallback. */ @@ -1009,7 +1029,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, } out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } @@ -1020,7 +1040,7 @@ out: * @skb: packet to check * @hdr_size: size of the encapsulation header * - * Returns true if the request has been answered, false otherwise. + * Return: true if the request has been answered, false otherwise. */ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -1089,7 +1109,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, } out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); if (ret) kfree_skb(skb); return ret; @@ -1143,7 +1163,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, * @skb: packet to check * @hdr_size: size of the encapsulation header * - * Returns true if the packet was snooped and consumed by DAT. False if the + * Return: true if the packet was snooped and consumed by DAT. False if the * packet has to be delivered to the interface */ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, @@ -1200,7 +1220,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the broadcast packet * - * Returns true if the node can drop the packet, false otherwise. + * Return: true if the node can drop the packet, false otherwise. */ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet) @@ -1242,6 +1262,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, out: if (dat_entry) - batadv_dat_entry_free_ref(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 26d4a525a798..813ecea96cf9 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 20d9282f895b..adb9c3989add 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -85,7 +85,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, /** * batadv_frag_size_limit - maximum possible size of packet to be fragmented * - * Returns the maximum size of payload that can be fragmented. + * Return: the maximum size of payload that can be fragmented. */ static int batadv_frag_size_limit(void) { @@ -107,7 +107,7 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Returns true if chain is empty and caller can just insert the new fragment + * Return: true if chain is empty and caller can just insert the new fragment * without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, @@ -136,7 +136,7 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, * Insert a new fragment into the reverse ordered chain in the right table * entry. The hash table entry is cleared if "old" fragments exist in it. * - * Returns true if skb is buffered, false on error. If the chain has all the + * Return: true if skb is buffered, false on error. If the chain has all the * fragments needed to merge the packet, the chain is moved to the passed head * to avoid locking the chain in the table. */ @@ -242,12 +242,11 @@ err: /** * batadv_frag_merge_packets - merge a chain of fragments * @chain: head of chain with fragments - * @skb: packet with total size of skb after merging * * Expand the first skb in the chain and copy the content of the remaining * skb's into the expanded one. After doing so, clear the chain. * - * Returns the merged skb or NULL on error. + * Return: the merged skb or NULL on error. */ static struct sk_buff * batadv_frag_merge_packets(struct hlist_head *chain) @@ -307,6 +306,9 @@ free: * There are three possible outcomes: 1) Packet is merged: Return true and * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and leave skb as is. + * + * Return: true when packet is merged or buffered, false when skb is not not + * used. */ bool batadv_frag_skb_buffer(struct sk_buff **skb, struct batadv_orig_node *orig_node_src) @@ -344,7 +346,7 @@ out_err: * will exceed the MTU towards the next-hop. If so, the fragment is forwarded * without merging it. * - * Returns true if the fragment is consumed/forwarded, false otherwise. + * Return: true if the fragment is consumed/forwarded, false otherwise. */ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_hard_iface *recv_if, @@ -383,9 +385,9 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, out: if (orig_node_dst) - batadv_orig_node_free_ref(orig_node_dst); + batadv_orig_node_put(orig_node_dst); if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -399,7 +401,7 @@ out: * passed mtu and the old one with the rest. The new skb contains data from the * tail of the old skb. * - * Returns the new fragment, NULL on error. + * Return: the new fragment, NULL on error. */ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, struct batadv_frag_packet *frag_head, @@ -433,7 +435,7 @@ err: * @orig_node: final destination of the created fragments * @neigh_node: next-hop of the created fragments * - * Returns true on success, false otherwise. + * Return: true on success, false otherwise. */ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node, @@ -510,7 +512,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, out_err: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return ret; } diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 8b9877e70b95..9ff77c7ef7c7 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -42,7 +42,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb, * batadv_frag_check_entry - check if a list of fragments has timed out * @frags_entry: table entry to check * - * Returns true if the frags entry has timed out, false otherwise. + * Return: true if the frags entry has timed out, false otherwise. */ static inline bool batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index e6c8382c79ba..c59aff5ccac8 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -28,6 +28,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rculist.h> @@ -59,12 +60,28 @@ */ #define BATADV_DHCP_CHADDR_OFFSET 28 -static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node) +/** + * batadv_gw_node_release - release gw_node from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the gw_node + */ +static void batadv_gw_node_release(struct kref *ref) { - if (atomic_dec_and_test(&gw_node->refcount)) { - batadv_orig_node_free_ref(gw_node->orig_node); - kfree_rcu(gw_node, rcu); - } + struct batadv_gw_node *gw_node; + + gw_node = container_of(ref, struct batadv_gw_node, refcount); + + batadv_orig_node_put(gw_node->orig_node); + kfree_rcu(gw_node, rcu); +} + +/** + * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it + * @gw_node: gateway node to free + */ +static void batadv_gw_node_put(struct batadv_gw_node *gw_node) +{ + kref_put(&gw_node->refcount, batadv_gw_node_release); } static struct batadv_gw_node * @@ -77,7 +94,7 @@ batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) if (!gw_node) goto out; - if (!atomic_inc_not_zero(&gw_node->refcount)) + if (!kref_get_unless_zero(&gw_node->refcount)) gw_node = NULL; out: @@ -100,14 +117,14 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) if (!orig_node) goto unlock; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; unlock: rcu_read_unlock(); out: if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); return orig_node; } @@ -118,14 +135,14 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->gw.list_lock); - if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount)) + if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount)) new_gw_node = NULL; curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); if (curr_gw_node) - batadv_gw_node_free_ref(curr_gw_node); + batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -170,7 +187,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (!router_ifinfo) goto next; - if (!atomic_inc_not_zero(&gw_node->refcount)) + if (!kref_get_unless_zero(&gw_node->refcount)) goto next; tq_avg = router_ifinfo->bat_iv.tq_avg; @@ -186,9 +203,9 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) ((tmp_gw_factor == max_gw_factor) && (tq_avg > max_tq))) { if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; - atomic_inc(&curr_gw->refcount); + kref_get(&curr_gw->refcount); } break; @@ -201,9 +218,9 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) */ if (tq_avg > max_tq) { if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; - atomic_inc(&curr_gw->refcount); + kref_get(&curr_gw->refcount); } break; } @@ -214,12 +231,12 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (tmp_gw_factor > max_gw_factor) max_gw_factor = tmp_gw_factor; - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); next: - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); @@ -255,7 +272,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) */ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); } void batadv_gw_election(struct batadv_priv *bat_priv) @@ -330,13 +347,13 @@ void batadv_gw_election(struct batadv_priv *bat_priv) out: if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); if (next_gw) - batadv_gw_node_free_ref(next_gw); + batadv_gw_node_put(next_gw); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } void batadv_gw_check_election(struct batadv_priv *bat_priv, @@ -397,15 +414,15 @@ reselect: batadv_gw_reselect(bat_priv); out: if (curr_gw_orig) - batadv_orig_node_free_ref(curr_gw_orig); + batadv_orig_node_put(curr_gw_orig); if (router_gw) - batadv_neigh_node_free_ref(router_gw); + batadv_neigh_node_put(router_gw); if (router_orig) - batadv_neigh_node_free_ref(router_orig); + batadv_neigh_node_put(router_orig); if (router_gw_tq) - batadv_neigh_ifinfo_free_ref(router_gw_tq); + batadv_neigh_ifinfo_put(router_gw_tq); if (router_orig_tq) - batadv_neigh_ifinfo_free_ref(router_orig_tq); + batadv_neigh_ifinfo_put(router_orig_tq); } /** @@ -423,12 +440,12 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) return; gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); if (!gw_node) { - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return; } @@ -436,7 +453,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - atomic_set(&gw_node->refcount, 1); + kref_init(&gw_node->refcount); spin_lock_bh(&bat_priv->gw.list_lock); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); @@ -456,7 +473,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * - * Returns gateway node if found or NULL otherwise. + * Return: gateway node if found or NULL otherwise. */ static struct batadv_gw_node * batadv_gw_node_get(struct batadv_priv *bat_priv, @@ -469,7 +486,7 @@ batadv_gw_node_get(struct batadv_priv *bat_priv, if (gw_node_tmp->orig_node != orig_node) continue; - if (!atomic_inc_not_zero(&gw_node_tmp->refcount)) + if (!kref_get_unless_zero(&gw_node_tmp->refcount)) continue; gw_node = gw_node_tmp; @@ -527,22 +544,23 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, * gets dereferenced. */ spin_lock_bh(&bat_priv->gw.list_lock); - hlist_del_init_rcu(&gw_node->list); + if (!hlist_unhashed(&gw_node->list)) { + hlist_del_init_rcu(&gw_node->list); + batadv_gw_node_put(gw_node); + } spin_unlock_bh(&bat_priv->gw.list_lock); - batadv_gw_node_free_ref(gw_node); - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); } out: if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); } void batadv_gw_node_delete(struct batadv_priv *bat_priv, @@ -565,7 +583,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { hlist_del_init_rcu(&gw_node->list); - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); } spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -602,12 +620,12 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv, ret = seq_has_overflowed(seq) ? -1 : 0; if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); out: if (router_ifinfo) - batadv_neigh_ifinfo_free_ref(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); return ret; } @@ -644,7 +662,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -655,13 +673,13 @@ out: * @chaddr: buffer where the client address will be stored. Valid * only if the function returns BATADV_DHCP_TO_CLIENT * - * Returns: + * This function may re-allocate the data buffer of the skb passed as argument. + * + * Return: * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error * while parsing it * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client - * - * This function may re-allocate the data buffer of the skb passed as argument. */ enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, @@ -776,11 +794,11 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, * server. Due to topology changes it may be the case that the GW server * previously selected is not the best one anymore. * - * Returns true if the packet destination is unicast and it is not the best gw, - * false otherwise. - * * This call might reallocate skb data. * Must be invoked only when the DHCP packet is going TO a DHCP SERVER. + * + * Return: true if the packet destination is unicast and it is not the best gw, + * false otherwise. */ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -838,7 +856,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, goto out; curr_tq_avg = curr_ifinfo->bat_iv.tq_avg; - batadv_neigh_ifinfo_free_ref(curr_ifinfo); + batadv_neigh_ifinfo_put(curr_ifinfo); break; case BATADV_GW_MODE_OFF: @@ -856,18 +874,18 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD) out_of_range = true; - batadv_neigh_ifinfo_free_ref(old_ifinfo); + batadv_neigh_ifinfo_put(old_ifinfo); out: if (orig_dst_node) - batadv_orig_node_free_ref(orig_dst_node); + batadv_orig_node_put(orig_dst_node); if (curr_gw) - batadv_gw_node_free_ref(curr_gw); + batadv_gw_node_put(curr_gw); if (gw_node) - batadv_gw_node_free_ref(gw_node); + batadv_gw_node_put(gw_node); if (neigh_old) - batadv_neigh_node_free_ref(neigh_old); + batadv_neigh_node_put(neigh_old); if (neigh_curr) - batadv_neigh_node_free_ref(neigh_curr); + batadv_neigh_node_put(neigh_curr); return out_of_range; } diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index fa9527785ed3..582dd8c413c8 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index b51bface8bdd..5ee04f7140af 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -38,7 +38,7 @@ * @description: text shown when throughput string cannot be parsed * @throughput: pointer holding the returned throughput information * - * Returns false on parse error and true otherwise. + * Return: false on parse error and true otherwise. */ static bool batadv_parse_throughput(struct net_device *net_dev, char *buff, const char *description, u32 *throughput) diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index ab893e318229..b58346350024 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 01acccc4d218..b22b2775a0a5 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,7 @@ #include "hard-interface.h" #include "main.h" +#include <linux/atomic.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> @@ -26,6 +27,7 @@ #include <linux/if_ether.h> #include <linux/if.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -47,13 +49,19 @@ #include "sysfs.h" #include "translation-table.h" -void batadv_hardif_free_rcu(struct rcu_head *rcu) +/** + * batadv_hardif_release - release hard interface from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the hard interface + */ +void batadv_hardif_release(struct kref *ref) { struct batadv_hard_iface *hard_iface; - hard_iface = container_of(rcu, struct batadv_hard_iface, rcu); + hard_iface = container_of(ref, struct batadv_hard_iface, refcount); dev_put(hard_iface->net_dev); - kfree(hard_iface); + + kfree_rcu(hard_iface, rcu); } struct batadv_hard_iface * @@ -64,7 +72,7 @@ batadv_hardif_get_by_netdev(const struct net_device *net_dev) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->net_dev == net_dev && - atomic_inc_not_zero(&hard_iface->refcount)) + kref_get_unless_zero(&hard_iface->refcount)) goto out; } @@ -76,6 +84,28 @@ out: } /** + * batadv_mutual_parents - check if two devices are each others parent + * @dev1: 1st net_device + * @dev2: 2nd net_device + * + * veth devices come in pairs and each is the parent of the other! + * + * Return: true if the devices are each others parent, otherwise false + */ +static bool batadv_mutual_parents(const struct net_device *dev1, + const struct net_device *dev2) +{ + int dev1_parent_iflink = dev_get_iflink(dev1); + int dev2_parent_iflink = dev_get_iflink(dev2); + + if (!dev1_parent_iflink || !dev2_parent_iflink) + return false; + + return (dev1_parent_iflink == dev2->ifindex) && + (dev2_parent_iflink == dev1->ifindex); +} + +/** * batadv_is_on_batman_iface - check if a device is a batman iface descendant * @net_dev: the device to check * @@ -85,7 +115,7 @@ out: * This function recursively checks all the fathers of the device passed as * argument looking for a batman-adv soft interface. * - * Returns true if the device is descendant of a batman-adv mesh interface (or + * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) @@ -108,6 +138,9 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) if (WARN(!parent_dev, "Cannot find parent device")) return false; + if (batadv_mutual_parents(net_dev, parent_dev)) + return false; + ret = batadv_is_on_batman_iface(parent_dev); return ret; @@ -136,7 +169,7 @@ static int batadv_is_valid_iface(const struct net_device *net_dev) * interface * @net_device: the device to check * - * Returns true if the net device is a 802.11 wireless device, false otherwise. + * Return: true if the net device is a 802.11 wireless device, false otherwise. */ bool batadv_is_wifi_netdev(struct net_device *net_device) { @@ -169,7 +202,7 @@ batadv_hardif_get_active(const struct net_device *soft_iface) continue; if (hard_iface->if_status == BATADV_IF_ACTIVE && - atomic_inc_not_zero(&hard_iface->refcount)) + kref_get_unless_zero(&hard_iface->refcount)) goto out; } @@ -193,7 +226,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, @@ -203,7 +236,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, ASSERT_RTNL(); - if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount)) + if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount)) new_hard_iface = NULL; curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); @@ -217,7 +250,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, out: if (curr_hard_iface) - batadv_hardif_free_ref(curr_hard_iface); + batadv_hardif_put(curr_hard_iface); } static bool @@ -376,7 +409,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void @@ -401,7 +434,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * * Invoke ndo_del_slave on master passing slave as argument. In this way slave * is free'd and master can correctly change its internal state. - * Return 0 on success, a negative value representing the error otherwise + * + * Return: 0 on success, a negative value representing the error otherwise */ static int batadv_master_del_slave(struct batadv_hard_iface *slave, struct net_device *master) @@ -430,7 +464,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) goto out; soft_iface = dev_get_by_name(&init_net, iface_name); @@ -528,7 +562,7 @@ err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); err: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -559,7 +593,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_primary_if_select(bat_priv, new_if); if (new_if) - batadv_hardif_free_ref(new_if); + batadv_hardif_put(new_if); } bat_priv->bat_algo_ops->bat_iface_disable(hard_iface); @@ -582,11 +616,11 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, } hard_iface->soft_iface = NULL; - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } /** @@ -605,7 +639,7 @@ static void batadv_hardif_remove_interface_finish(struct work_struct *work) batadv_debugfs_del_hardif(hard_iface); batadv_sysfs_del_hardif(&hard_iface->hardif_obj); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); } static struct batadv_hard_iface * @@ -651,7 +685,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; /* extra reference for return */ - atomic_set(&hard_iface->refcount, 2); + kref_init(&hard_iface->refcount); + kref_get(&hard_iface->refcount); batadv_check_known_mac_addr(hard_iface->net_dev); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); @@ -759,10 +794,10 @@ static int batadv_hard_if_event(struct notifier_block *this, } hardif_put: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NOTIFY_DONE; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 5a31420513e1..d74f1983f33e 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -20,8 +20,8 @@ #include "main.h" -#include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/kref.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> @@ -61,30 +61,16 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, void batadv_hardif_remove_interfaces(void); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); -void batadv_hardif_free_rcu(struct rcu_head *rcu); +void batadv_hardif_release(struct kref *ref); /** - * batadv_hardif_free_ref - decrement the hard interface refcounter and - * possibly free it + * batadv_hardif_put - decrement the hard interface refcounter and possibly + * release it * @hard_iface: the hard interface to free */ -static inline void -batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) +static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface) { - if (atomic_dec_and_test(&hard_iface->refcount)) - call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); -} - -/** - * batadv_hardif_free_ref_now - decrement the hard interface refcounter and - * possibly free it (without rcu callback) - * @hard_iface: the hard interface to free - */ -static inline void -batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface) -{ - if (atomic_dec_and_test(&hard_iface->refcount)) - batadv_hardif_free_rcu(&hard_iface->rcu); + kref_put(&hard_iface->refcount, batadv_hardif_release); } static inline struct batadv_hard_iface * @@ -97,7 +83,7 @@ batadv_primary_if_get_selected(struct batadv_priv *bat_priv) if (!hard_iface) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) hard_iface = NULL; out: diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 2ea6a18d793f..a0a0fdb85805 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 377626250ac7..9bb57b87447c 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -30,14 +30,17 @@ struct lock_class_key; /* callback to a compare function. should compare 2 element datas for their - * keys, return 0 if same and not 0 if not same + * keys + * + * Return: 0 if same and not 0 if not same */ typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *, const void *); -/* the hashfunction, should return an index - * based on the key in the data of the first - * argument and the size the second +/* the hashfunction + * + * Return: an index based on the key in the data of the first argument and the + * size the second */ typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); @@ -96,7 +99,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, * @data: data passed to the aforementioned callbacks as argument * @data_node: to be added element * - * Returns 0 on success, 1 if the element already is in the hash + * Return: 0 on success, 1 if the element already is in the hash * and -1 on error. */ static inline int batadv_hash_add(struct batadv_hashtable *hash, @@ -139,10 +142,11 @@ out: return ret; } -/* removes data from hash, if found. returns pointer do data on success, so you - * can remove the used structure yourself, or NULL on error . data could be the - * structure you use with just the key filled, we just need the key for - * comparing. +/* removes data from hash, if found. data could be the structure you use with + * just the key filled, we just need the key for comparing. + * + * Return: returns pointer do data on success, so you can remove the used + * structure yourself, or NULL on error */ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index bcabb5e3f4d3..6268f08b7154 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -288,11 +288,11 @@ free_skb: kfree_skb(skb); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return len; } diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index e937143f0b10..618d5de06f20 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 4b5d61fbadb1..e3d7051747b0 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -29,6 +29,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/module.h> @@ -233,7 +234,7 @@ void batadv_mesh_free(struct net_device *soft_iface) * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check * - * Returns 'true' if the mac address was found, false otherwise. + * Return: 'true' if the mac address was found, false otherwise. */ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { @@ -262,7 +263,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) * function that requires the primary interface * @seq: debugfs table seq_file struct * - * Returns primary interface if found or NULL otherwise. + * Return: primary interface if found or NULL otherwise. */ struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq) @@ -286,7 +287,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq) seq_printf(seq, "BATMAN mesh %s disabled - primary interface not active\n", net_dev->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); primary_if = NULL; out: @@ -297,7 +298,7 @@ out: * batadv_max_header_len - calculate maximum encapsulation overhead for a * payload packet * - * Return the maximum encapsulation overhead in bytes. + * Return: the maximum encapsulation overhead in bytes. */ int batadv_max_header_len(void) { @@ -599,6 +600,8 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) * * payload_ptr must always point to an address in the skb head buffer and not to * a fragment. + * + * Return: big endian crc32c of the checksummed data */ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) { @@ -622,15 +625,26 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) } /** - * batadv_tvlv_handler_free_ref - decrement the tvlv handler refcounter and - * possibly free it + * batadv_tvlv_handler_release - release tvlv handler from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_handler_release(struct kref *ref) +{ + struct batadv_tvlv_handler *tvlv_handler; + + tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); + kfree_rcu(tvlv_handler, rcu); +} + +/** + * batadv_tvlv_handler_put - decrement the tvlv container refcounter and + * possibly release it * @tvlv_handler: the tvlv handler to free */ -static void -batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) +static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { - if (atomic_dec_and_test(&tvlv_handler->refcount)) - kfree_rcu(tvlv_handler, rcu); + kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } /** @@ -640,7 +654,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) * @type: tvlv handler type to look for * @version: tvlv handler version to look for * - * Returns tvlv handler if found or NULL otherwise. + * Return: tvlv handler if found or NULL otherwise. */ static struct batadv_tvlv_handler *batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) @@ -656,7 +670,7 @@ static struct batadv_tvlv_handler if (tvlv_handler_tmp->version != version) continue; - if (!atomic_inc_not_zero(&tvlv_handler_tmp->refcount)) + if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) continue; tvlv_handler = tvlv_handler_tmp; @@ -668,14 +682,25 @@ static struct batadv_tvlv_handler } /** - * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and - * possibly free it + * batadv_tvlv_container_release - release tvlv from lists and free + * @ref: kref pointer of the tvlv + */ +static void batadv_tvlv_container_release(struct kref *ref) +{ + struct batadv_tvlv_container *tvlv; + + tvlv = container_of(ref, struct batadv_tvlv_container, refcount); + kfree(tvlv); +} + +/** + * batadv_tvlv_container_put - decrement the tvlv container refcounter and + * possibly release it * @tvlv: the tvlv container to free */ -static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) +static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { - if (atomic_dec_and_test(&tvlv->refcount)) - kfree(tvlv); + kref_put(&tvlv->refcount, batadv_tvlv_container_release); } /** @@ -688,13 +713,15 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * - * Returns tvlv container if found or NULL otherwise. + * Return: tvlv container if found or NULL otherwise. */ static struct batadv_tvlv_container *batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { if (tvlv_tmp->tvlv_hdr.type != type) continue; @@ -702,7 +729,7 @@ static struct batadv_tvlv_container if (tvlv_tmp->tvlv_hdr.version != version) continue; - if (!atomic_inc_not_zero(&tvlv_tmp->refcount)) + if (!kref_get_unless_zero(&tvlv_tmp->refcount)) continue; tvlv = tvlv_tmp; @@ -720,13 +747,15 @@ static struct batadv_tvlv_container * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * - * Returns size of all currently registered tvlv containers in bytes. + * Return: size of all currently registered tvlv containers in bytes. */ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; u16 tvlv_len = 0; + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); + hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); tvlv_len += ntohs(tvlv->tvlv_hdr.len); @@ -755,8 +784,8 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, hlist_del(&tvlv->list); /* first call to decrement the counter, second call to free */ - batadv_tvlv_container_free_ref(tvlv); - batadv_tvlv_container_free_ref(tvlv); + batadv_tvlv_container_put(tvlv); + batadv_tvlv_container_put(tvlv); } /** @@ -808,7 +837,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); INIT_HLIST_NODE(&tvlv_new->list); - atomic_set(&tvlv_new->refcount, 1); + kref_init(&tvlv_new->refcount); spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); @@ -826,7 +855,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, * @additional_packet_len: requested additional packet size on top of minimum * size * - * Returns true of the packet buffer could be changed to the requested size, + * Return: true of the packet buffer could be changed to the requested size, * false otherwise. */ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, @@ -862,7 +891,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * - * Returns size of all appended tvlv containers in bytes. + * Return: size of all appended tvlv containers in bytes. */ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, @@ -915,7 +944,7 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Returns success if handler was not found or the return value of the handler + * Return: success if handler was not found or the return value of the handler * callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, @@ -968,7 +997,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Returns success when processing an OGM or the return value of all called + * Return: success when processing an OGM or the return value of all called * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, @@ -1001,7 +1030,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, src, dst, tvlv_value, tvlv_value_cont_len); if (tvlv_handler) - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } @@ -1081,7 +1110,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); return; } @@ -1094,7 +1123,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; - atomic_set(&tvlv_handler->refcount, 1); + kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); @@ -1118,11 +1147,11 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, if (!tvlv_handler) return; - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); hlist_del_rcu(&tvlv_handler->list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); - batadv_tvlv_handler_free_ref(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); } /** @@ -1182,7 +1211,7 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) kfree_skb(skb); out: - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } /** @@ -1190,8 +1219,8 @@ out: * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header * - * If the packet embedded in the skb is vlan tagged this function returns the - * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned. + * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the + * skb is vlan tagged. Otherwise BATADV_NO_FLAGS. */ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) { @@ -1218,7 +1247,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Returns true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN idenfied by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) @@ -1232,7 +1261,7 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { ap_isolation_enabled = atomic_read(&vlan->ap_isolation); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } return ap_isolation_enabled; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 9dbd9107e7e1..8c01f54c61f3 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -35,6 +35,9 @@ /* Time To Live of broadcast messages */ #define BATADV_TTL 50 +/* maximum sequence number age of broadcast messages */ +#define BATADV_BCAST_MAX_AGE 64 + /* purge originators after time in seconds if no valid packet comes in * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE */ @@ -97,11 +100,6 @@ */ #define BATADV_TQ_SIMILARITY_THRESHOLD 50 -/* how much worse secondary interfaces may be to be considered as bonding - * candidates - */ -#define BATADV_BONDING_TQ_THRESHOLD 50 - /* should not be bigger than 512 bytes or change the size of * forw_packet->direct_link_flags */ @@ -273,9 +271,14 @@ static inline void _batadv_dbg(int type __always_unused, pr_err("%s: " fmt, _netdev->name, ## arg); \ } while (0) -/* returns 1 if they are the same ethernet addr +/** + * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses + * @data1: Pointer to a six-byte array containing the Ethernet address + * @data2: Pointer other six-byte array containing the Ethernet address * * note: can't use ether_addr_equal() as it requires aligned memory + * + * Return: 1 if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { @@ -287,7 +290,7 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2) * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * - * Returns true if current time is after timestamp + timeout + * Return: true if current time is after timestamp + timeout */ static inline bool batadv_has_timed_out(unsigned long timestamp, unsigned int timeout) @@ -326,7 +329,13 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) -/* Sum and return the cpu-local counters for index 'idx' */ +/** + * batadv_sum_counter - Sum the cpu-local counters for index 'idx' + * @bat_priv: the bat priv with all the soft interface information + * @idx: index of counter to sum up + * + * Return: sum of all cpu-local counters + */ static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index eb76386f8d4b..8caa2c72efa3 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -30,6 +30,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -55,7 +56,7 @@ * Collect multicast addresses of the local multicast listeners * on the given soft interface, dev, in the given mcast_list. * - * Returns -ENOMEM on memory allocation error or the number of + * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_softif_get(struct net_device *dev, @@ -87,7 +88,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * - * Returns true if the given address is already in the given list. + * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, @@ -195,8 +196,9 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, * batadv_mcast_has_bridge - check whether the soft-iface is bridged * @bat_priv: the bat priv with all the soft interface information * - * Checks whether there is a bridge on top of our soft interface. Returns - * true if so, false otherwise. + * Checks whether there is a bridge on top of our soft interface. + * + * Return: true if there is a bridge, false otherwise. */ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) { @@ -218,7 +220,7 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. * - * Returns true if the tvlv container is registered afterwards. Otherwise + * Return: true if the tvlv container is registered afterwards. Otherwise * returns false. */ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) @@ -289,8 +291,8 @@ out: * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM in case of - * memory allocation failure. + * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory + * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -327,8 +329,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out - * of memory. + * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -366,8 +367,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * - * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out - * of memory. + * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -398,7 +398,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * - * Returns the number of nodes which want all IPv4 multicast traffic if the + * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ @@ -421,7 +421,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination * - * Returns an orig_node matching the multicast address provided by ethhdr + * Return: an orig_node matching the multicast address provided by ethhdr * via a translation table lookup. This increases the returned nodes refcount. */ static struct batadv_orig_node * @@ -436,7 +436,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and * increases its refcount. */ static struct batadv_orig_node * @@ -448,7 +448,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -463,7 +463,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set * and increases its refcount. */ static struct batadv_orig_node * @@ -475,7 +475,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -491,7 +491,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and * increases its refcount. */ @@ -514,7 +514,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * - * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag + * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag * set and increases its refcount. */ static struct batadv_orig_node * @@ -526,7 +526,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_unsnoopables_list, mcast_want_all_unsnoopables_node) { - if (!atomic_inc_not_zero(&tmp_orig_node->refcount)) + if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; @@ -543,7 +543,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) * @skb: The multicast packet to check * @orig: an originator to be set to forward the skb to * - * Returns the forwarding mode as enum batadv_forw_mode and in case of + * Return: the forwarding mode as enum batadv_forw_mode and in case of * BATADV_FORW_SINGLE set the orig to the single originator the skb * should be forwarded to. */ @@ -802,7 +802,9 @@ void batadv_mcast_free(struct batadv_priv *bat_priv) batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + spin_lock_bh(&bat_priv->tt.commit_lock); batadv_mcast_mla_tt_retract(bat_priv, NULL); + spin_unlock_bh(&bat_priv->tt.commit_lock); } /** diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 8f3cb04b9f13..80bceec55592 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -23,7 +23,7 @@ struct sk_buff; /** - * batadv_forw_mode - the way a packet should be forwarded as + * enum batadv_forw_mode - the way a packet should be forwarded as * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic * flooding) * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index c98b0ab85449..d253bb23e2ac 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -32,6 +32,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -64,6 +65,8 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, /** * batadv_nc_init - one-time initialization for network coding + * + * Return: 0 on success or negative error number in case of failure */ int __init batadv_nc_init(void) { @@ -142,6 +145,8 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /** * batadv_nc_mesh_init - initialise coding hash table and start house keeping * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure */ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) { @@ -203,39 +208,52 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) } /** - * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove - * its refcount on the orig_node - * @rcu: rcu pointer of the nc node + * batadv_nc_node_release - release nc_node from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the nc_node */ -static void batadv_nc_node_free_rcu(struct rcu_head *rcu) +static void batadv_nc_node_release(struct kref *ref) { struct batadv_nc_node *nc_node; - nc_node = container_of(rcu, struct batadv_nc_node, rcu); - batadv_orig_node_free_ref(nc_node->orig_node); - kfree(nc_node); + nc_node = container_of(ref, struct batadv_nc_node, refcount); + + batadv_orig_node_put(nc_node->orig_node); + kfree_rcu(nc_node, rcu); +} + +/** + * batadv_nc_node_put - decrement the nc_node refcounter and possibly + * release it + * @nc_node: nc_node to be free'd + */ +static void batadv_nc_node_put(struct batadv_nc_node *nc_node) +{ + kref_put(&nc_node->refcount, batadv_nc_node_release); } /** - * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly - * frees it - * @nc_node: the nc node to free + * batadv_nc_path_release - release nc_path from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the nc_path */ -static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) +static void batadv_nc_path_release(struct kref *ref) { - if (atomic_dec_and_test(&nc_node->refcount)) - call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu); + struct batadv_nc_path *nc_path; + + nc_path = container_of(ref, struct batadv_nc_path, refcount); + + kfree_rcu(nc_path, rcu); } /** - * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly - * frees it - * @nc_path: the nc node to free + * batadv_nc_path_put - decrement the nc_path refcounter and possibly + * release it + * @nc_path: nc_path to be free'd */ -static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) +static void batadv_nc_path_put(struct batadv_nc_path *nc_path) { - if (atomic_dec_and_test(&nc_path->refcount)) - kfree_rcu(nc_path, rcu); + kref_put(&nc_path->refcount, batadv_nc_path_release); } /** @@ -245,7 +263,7 @@ static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) { kfree_skb(nc_packet->skb); - batadv_nc_path_free_ref(nc_packet->nc_path); + batadv_nc_path_put(nc_packet->nc_path); kfree(nc_packet); } @@ -254,7 +272,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) * @bat_priv: the bat priv with all the soft interface information * @nc_node: the nc node to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, struct batadv_nc_node *nc_node) @@ -270,7 +288,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) @@ -290,7 +308,7 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @nc_path: the nc path to check * - * Returns true if the entry has to be purged now, false otherwise + * Return: true if the entry has to be purged now, false otherwise */ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, struct batadv_nc_path *nc_path) @@ -338,7 +356,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, "Removing nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); list_del_rcu(&nc_node->list); - batadv_nc_node_free_ref(nc_node); + batadv_nc_node_put(nc_node); } spin_unlock_bh(lock); } @@ -449,7 +467,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, "Remove nc_path %pM -> %pM\n", nc_path->prev_hop, nc_path->next_hop); hlist_del_rcu(&nc_path->hash_entry); - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); } spin_unlock_bh(lock); } @@ -473,7 +491,7 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, * @data: data to hash * @size: size of the hash table * - * Returns the selected index in the hash table for the given data. + * Return: the selected index in the hash table for the given data. */ static u32 batadv_nc_hash_choose(const void *data, u32 size) { @@ -492,7 +510,7 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size) * @node: node in the local table * @data2: second object to compare the node to * - * Returns 1 if the two entry are the same, 0 otherwise + * Return: 1 if the two entry are the same, 0 otherwise */ static int batadv_nc_hash_compare(const struct hlist_node *node, const void *data2) @@ -519,7 +537,7 @@ static int batadv_nc_hash_compare(const struct hlist_node *node, * @hash: hash table containing the nc path * @data: search key * - * Returns the nc_path if found, NULL otherwise. + * Return: the nc_path if found, NULL otherwise. */ static struct batadv_nc_path * batadv_nc_hash_find(struct batadv_hashtable *hash, @@ -540,7 +558,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash, if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) continue; - if (!atomic_inc_not_zero(&nc_path->refcount)) + if (!kref_get_unless_zero(&nc_path->refcount)) continue; nc_path_tmp = nc_path; @@ -574,7 +592,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) * timeout. If so, the packet is no longer kept and the entry deleted from the * queue. Has to be called with the appropriate locks. * - * Returns false as soon as the entry in the fifo queue has not been timed out + * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, @@ -613,7 +631,7 @@ out: * packet is no longer delayed, immediately sent and the entry deleted from the * queue. Has to be called with the appropriate locks. * - * Returns false as soon as the entry in the fifo queue has not been timed out + * Return: false as soon as the entry in the fifo queue has not been timed out * yet and true otherwise. */ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, @@ -734,7 +752,7 @@ static void batadv_nc_worker(struct work_struct *work) * @orig_node: neighboring orig node which may be used as nc candidate * @ogm_packet: incoming ogm packet also used for the checks * - * Returns true if: + * Return: true if: * 1) The OGM must have the most recent sequence number. * 2) The TTL must be decremented by one and only one. * 3) The OGM must be received from the first hop from orig_node. @@ -754,7 +772,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, last_ttl = orig_ifinfo->last_ttl; last_real_seqno = orig_ifinfo->last_real_seqno; - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); if (last_real_seqno != ntohl(ogm_packet->seqno)) return false; @@ -775,7 +793,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * - * Returns the nc_node if found, NULL otherwise. + * Return: the nc_node if found, NULL otherwise. */ static struct batadv_nc_node *batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, @@ -796,7 +814,7 @@ static struct batadv_nc_node if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) continue; - if (!atomic_inc_not_zero(&nc_node->refcount)) + if (!kref_get_unless_zero(&nc_node->refcount)) continue; /* Found a match */ @@ -817,7 +835,7 @@ static struct batadv_nc_node * (can be equal to orig_node) * @in_coding: traverse incoming or outgoing network coding list * - * Returns the nc_node if found or created, NULL in case of an error. + * Return: the nc_node if found or created, NULL in case of an error. */ static struct batadv_nc_node *batadv_nc_get_nc_node(struct batadv_priv *bat_priv, @@ -840,14 +858,15 @@ static struct batadv_nc_node if (!nc_node) return NULL; - if (!atomic_inc_not_zero(&orig_neigh_node->refcount)) + if (!kref_get_unless_zero(&orig_neigh_node->refcount)) goto free; /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); ether_addr_copy(nc_node->addr, orig_node->orig); nc_node->orig_node = orig_neigh_node; - atomic_set(&nc_node->refcount, 2); + kref_init(&nc_node->refcount); + kref_get(&nc_node->refcount); /* Select ingoing or outgoing coding node */ if (in_coding) { @@ -923,9 +942,9 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, out: if (in_nc_node) - batadv_nc_node_free_ref(in_nc_node); + batadv_nc_node_put(in_nc_node); if (out_nc_node) - batadv_nc_node_free_ref(out_nc_node); + batadv_nc_node_put(out_nc_node); } /** @@ -935,7 +954,7 @@ out: * @src: ethernet source address - first half of the nc path search key * @dst: ethernet destination address - second half of the nc path search key * - * Returns pointer to nc_path if the path was found or created, returns NULL + * Return: pointer to nc_path if the path was found or created, returns NULL * on error. */ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, @@ -966,7 +985,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, /* Initialize nc_path */ INIT_LIST_HEAD(&nc_path->packet_list); spin_lock_init(&nc_path->packet_list_lock); - atomic_set(&nc_path->refcount, 2); + kref_init(&nc_path->refcount); + kref_get(&nc_path->refcount); nc_path->last_valid = jiffies; ether_addr_copy(nc_path->next_hop, dst); ether_addr_copy(nc_path->prev_hop, src); @@ -992,6 +1012,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value + * + * Return: scaled tq value */ static u8 batadv_nc_random_weight_tq(u8 tq) { @@ -1032,7 +1054,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) * @nc_packet: structure containing the packet to the skb can be coded with * @neigh_node: next hop to forward packet to * - * Returns true if both packets are consumed, false otherwise. + * Return: true if both packets are consumed, false otherwise. */ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -1206,13 +1228,13 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, res = true; out: if (router_neigh) - batadv_neigh_node_free_ref(router_neigh); + batadv_neigh_node_put(router_neigh); if (router_coding) - batadv_neigh_node_free_ref(router_coding); + batadv_neigh_node_put(router_coding); if (router_neigh_ifinfo) - batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo); + batadv_neigh_ifinfo_put(router_neigh_ifinfo); if (router_coding_ifinfo) - batadv_neigh_ifinfo_free_ref(router_coding_ifinfo); + batadv_neigh_ifinfo_put(router_coding_ifinfo); return res; } @@ -1231,7 +1253,7 @@ out: * Since the source encoded the packet we can be certain it has all necessary * decode information. * - * Returns true if coding of a decoded packet is allowed. + * Return: true if coding of a decoded packet is allowed. */ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) { @@ -1249,7 +1271,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) * @skb: data skb to forward * @eth_dst: next hop mac address of skb * - * Returns true if coding of a decoded skb is allowed. + * Return: true if coding of a decoded skb is allowed. */ static struct batadv_nc_packet * batadv_nc_path_search(struct batadv_priv *bat_priv, @@ -1317,7 +1339,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, * @eth_src: source mac address of skb * @in_nc_node: pointer to skb next hop's neighbor nc node * - * Returns an nc packet if a suitable coding packet was found, NULL otherwise. + * Return: an nc packet if a suitable coding packet was found, NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_skb_src_search(struct batadv_priv *bat_priv, @@ -1350,7 +1372,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv, } rcu_read_unlock(); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return nc_packet; } @@ -1400,7 +1422,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * next hop that potentially sent a packet which our next hop also received * (overheard) and has stored for later decoding. * - * Returns true if the skb was consumed (encoded packet sent) or false otherwise + * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, struct batadv_neigh_node *neigh_node, @@ -1454,7 +1476,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, * @neigh_node: next hop to forward packet to * @packet_id: checksum to identify packet * - * Returns true if the packet was buffered or false in case of an error. + * Return: true if the packet was buffered or false in case of an error. */ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, struct batadv_nc_path *nc_path, @@ -1488,7 +1510,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, * @skb: data skb to forward * @neigh_node: next hop to forward packet to * - * Returns true if the skb was consumed (encoded packet sent) or false otherwise + * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ bool batadv_nc_skb_forward(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) @@ -1533,7 +1555,7 @@ bool batadv_nc_skb_forward(struct sk_buff *skb, return true; free_nc_path: - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); out: /* Packet is not consumed */ return false; @@ -1595,7 +1617,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, free_skb: kfree_skb(skb); free_nc_path: - batadv_nc_path_free_ref(nc_path); + batadv_nc_path_put(nc_path); out: return; } @@ -1627,7 +1649,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, * @skb: unicast skb to decode * @nc_packet: decode data needed to decode the skb * - * Returns pointer to decoded unicast packet if the packet was decoded or NULL + * Return: pointer to decoded unicast packet if the packet was decoded or NULL * in case of an error. */ static struct batadv_unicast_packet * @@ -1721,7 +1743,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, * @ethhdr: pointer to the ethernet header inside the coded packet * @coded: coded packet we try to find decode data for * - * Returns pointer to nc packet if the needed data was found or NULL otherwise. + * Return: pointer to nc packet if the needed data was found or NULL otherwise. */ static struct batadv_nc_packet * batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, @@ -1784,6 +1806,9 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, * resulting unicast packet * @skb: incoming coded packet * @recv_if: pointer to interface this packet was received on + * + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * otherwise. */ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) @@ -1868,6 +1893,8 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) * batadv_nc_nodes_seq_print_text - print the nc node information * @seq: seq file to print on * @offset: not used + * + * Return: always 0 */ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) { @@ -1923,13 +1950,15 @@ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } /** * batadv_nc_init_debugfs - create nc folder and related files in debugfs * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure */ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 8f6d4ad8778a..d6d7fb4ec5d5 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 3c782a33bdac..e4cbb0753e37 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,11 +18,13 @@ #include "originator.h" #include "main.h" +#include <linux/atomic.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -47,7 +49,13 @@ static struct lock_class_key batadv_orig_hash_lock_class_key; static void batadv_purge_orig(struct work_struct *work); -/* returns 1 if they are the same originator */ +/** + * batadv_compare_orig - comparing function used in the originator hash table + * @node: node in the local table + * @data2: second object to compare the node to + * + * Return: 1 if they are the same originator + */ int batadv_compare_orig(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_orig_node, @@ -61,7 +69,7 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2) * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * - * Returns the vlan object identified by vid and belonging to orig_node or NULL + * Return: the vlan object identified by vid and belonging to orig_node or NULL * if it does not exist. */ struct batadv_orig_node_vlan * @@ -75,7 +83,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, if (tmp->vid != vid) continue; - if (!atomic_inc_not_zero(&tmp->refcount)) + if (!kref_get_unless_zero(&tmp->refcount)) continue; vlan = tmp; @@ -93,7 +101,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, * @orig_node: the originator serving the VLAN * @vid: the VLAN identifier * - * Returns NULL in case of failure or the vlan object identified by vid and + * Return: NULL in case of failure or the vlan object identified by vid and * belonging to orig_node otherwise. The object is created and added to the list * if it does not exist. * @@ -116,7 +124,8 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, if (!vlan) goto out; - atomic_set(&vlan->refcount, 2); + kref_init(&vlan->refcount); + kref_get(&vlan->refcount); vlan->vid = vid; hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); @@ -128,14 +137,27 @@ out: } /** - * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly free + * batadv_orig_node_vlan_release - release originator-vlan object from lists + * and queue for free after rcu grace period + * @ref: kref pointer of the originator-vlan object + */ +static void batadv_orig_node_vlan_release(struct kref *ref) +{ + struct batadv_orig_node_vlan *orig_vlan; + + orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount); + + kfree_rcu(orig_vlan, rcu); +} + +/** + * batadv_orig_node_vlan_put - decrement the refcounter and possibly release * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ -void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan) +void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) { - if (atomic_dec_and_test(&orig_vlan->refcount)) - kfree_rcu(orig_vlan, rcu); + kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } int batadv_originator_init(struct batadv_priv *bat_priv) @@ -163,90 +185,68 @@ err: } /** - * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object - * @rcu: rcu pointer of the neigh_ifinfo object + * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the neigh_ifinfo */ -static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu) +static void batadv_neigh_ifinfo_release(struct kref *ref) { struct batadv_neigh_ifinfo *neigh_ifinfo; - neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu); + neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount); if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing); + batadv_hardif_put(neigh_ifinfo->if_outgoing); - kfree(neigh_ifinfo); + kfree_rcu(neigh_ifinfo, rcu); } /** - * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free - * the neigh_ifinfo (without rcu callback) - * @neigh_ifinfo: the neigh_ifinfo object to release - */ -static void -batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo) -{ - if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu); -} - -/** - * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free + * batadv_neigh_ifinfo_put - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ -void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) +void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) { - if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu); + kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } /** - * batadv_hardif_neigh_free_rcu - free the hardif neigh_node - * @rcu: rcu pointer of the neigh_node + * batadv_hardif_neigh_release - release hardif neigh node from lists and + * queue for free after rcu grace period + * @ref: kref pointer of the neigh_node */ -static void batadv_hardif_neigh_free_rcu(struct rcu_head *rcu) +static void batadv_hardif_neigh_release(struct kref *ref) { struct batadv_hardif_neigh_node *hardif_neigh; - hardif_neigh = container_of(rcu, struct batadv_hardif_neigh_node, rcu); + hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node, + refcount); spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); hlist_del_init_rcu(&hardif_neigh->list); spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); - batadv_hardif_free_ref_now(hardif_neigh->if_incoming); - kfree(hardif_neigh); + batadv_hardif_put(hardif_neigh->if_incoming); + kfree_rcu(hardif_neigh, rcu); } /** - * batadv_hardif_neigh_free_now - decrement the hardif neighbors refcounter - * and possibly free it (without rcu callback) + * batadv_hardif_neigh_put - decrement the hardif neighbors refcounter + * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ -static void -batadv_hardif_neigh_free_now(struct batadv_hardif_neigh_node *hardif_neigh) +void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&hardif_neigh->refcount)) - batadv_hardif_neigh_free_rcu(&hardif_neigh->rcu); + kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } /** - * batadv_hardif_neigh_free_ref - decrement the hardif neighbors refcounter - * and possibly free it - * @hardif_neigh: hardif neigh neighbor to free + * batadv_neigh_node_release - release neigh_node from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the neigh_node */ -void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh) -{ - if (atomic_dec_and_test(&hardif_neigh->refcount)) - call_rcu(&hardif_neigh->rcu, batadv_hardif_neigh_free_rcu); -} - -/** - * batadv_neigh_node_free_rcu - free the neigh_node - * @rcu: rcu pointer of the neigh_node - */ -static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) +static void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; @@ -254,51 +254,38 @@ static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_algo_ops *bao; - neigh_node = container_of(rcu, struct batadv_neigh_node, rcu); + neigh_node = container_of(ref, struct batadv_neigh_node, refcount); bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { - batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming, neigh_node->addr); if (hardif_neigh) { /* batadv_hardif_neigh_get() increases refcount too */ - batadv_hardif_neigh_free_now(hardif_neigh); - batadv_hardif_neigh_free_now(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); } if (bao->bat_neigh_free) bao->bat_neigh_free(neigh_node); - batadv_hardif_free_ref_now(neigh_node->if_incoming); + batadv_hardif_put(neigh_node->if_incoming); - kfree(neigh_node); -} - -/** - * batadv_neigh_node_free_ref_now - decrement the neighbors refcounter - * and possibly free it (without rcu callback) - * @neigh_node: neigh neighbor to free - */ -static void -batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node) -{ - if (atomic_dec_and_test(&neigh_node->refcount)) - batadv_neigh_node_free_rcu(&neigh_node->rcu); + kfree_rcu(neigh_node, rcu); } /** - * batadv_neigh_node_free_ref - decrement the neighbors refcounter - * and possibly free it + * batadv_neigh_node_put - decrement the neighbors refcounter and possibly + * release it * @neigh_node: neigh neighbor to free */ -void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) +void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) { - if (atomic_dec_and_test(&neigh_node->refcount)) - call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu); + kref_put(&neigh_node->refcount, batadv_neigh_node_release); } /** @@ -307,7 +294,7 @@ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Returns the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -327,7 +314,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, break; } - if (router && !atomic_inc_not_zero(&router->refcount)) + if (router && !kref_get_unless_zero(&router->refcount)) router = NULL; rcu_read_unlock(); @@ -339,7 +326,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns the requested orig_ifinfo or NULL if not found. + * Return: the requested orig_ifinfo or NULL if not found. * * The object is returned with refcounter increased by 1. */ @@ -355,7 +342,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, if (tmp->if_outgoing != if_outgoing) continue; - if (!atomic_inc_not_zero(&tmp->refcount)) + if (!kref_get_unless_zero(&tmp->refcount)) continue; orig_ifinfo = tmp; @@ -371,7 +358,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns NULL in case of failure or the orig_ifinfo object for the if_outgoing + * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing * interface otherwise. The object is created and added to the list * if it does not exist. * @@ -395,7 +382,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, goto out; if (if_outgoing != BATADV_IF_DEFAULT && - !atomic_inc_not_zero(&if_outgoing->refcount)) { + !kref_get_unless_zero(&if_outgoing->refcount)) { kfree(orig_ifinfo); orig_ifinfo = NULL; goto out; @@ -406,7 +393,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, orig_ifinfo->batman_seqno_reset = reset_time; orig_ifinfo->if_outgoing = if_outgoing; INIT_HLIST_NODE(&orig_ifinfo->list); - atomic_set(&orig_ifinfo->refcount, 2); + kref_init(&orig_ifinfo->refcount); + kref_get(&orig_ifinfo->refcount); hlist_add_head_rcu(&orig_ifinfo->list, &orig_node->ifinfo_list); out: @@ -416,12 +404,12 @@ out: /** * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node - * @neigh_node: the neigh node to be queried + * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * * The object is returned with refcounter increased by 1. * - * Returns the requested neigh_ifinfo or NULL if not found + * Return: the requested neigh_ifinfo or NULL if not found */ struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, @@ -436,7 +424,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, if (tmp_neigh_ifinfo->if_outgoing != if_outgoing) continue; - if (!atomic_inc_not_zero(&tmp_neigh_ifinfo->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount)) continue; neigh_ifinfo = tmp_neigh_ifinfo; @@ -449,10 +437,10 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, /** * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object - * @neigh_node: the neigh node to be queried + * @neigh: the neigh node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired * - * Returns NULL in case of failure or the neigh_ifinfo object for the + * Return: NULL in case of failure or the neigh_ifinfo object for the * if_outgoing interface otherwise. The object is created and added to the list * if it does not exist. * @@ -474,14 +462,15 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, if (!neigh_ifinfo) goto out; - if (if_outgoing && !atomic_inc_not_zero(&if_outgoing->refcount)) { + if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) { kfree(neigh_ifinfo); neigh_ifinfo = NULL; goto out; } INIT_HLIST_NODE(&neigh_ifinfo->list); - atomic_set(&neigh_ifinfo->refcount, 2); + kref_init(&neigh_ifinfo->refcount); + kref_get(&neigh_ifinfo->refcount); neigh_ifinfo->if_outgoing = if_outgoing; hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); @@ -500,7 +489,8 @@ out: * * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. - * Returns NULL if the neighbour is not found. + * + * Return: neighbor when found. Othwerwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -517,7 +507,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, if (tmp_neigh_node->if_incoming != hard_iface) continue; - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; res = tmp_neigh_node; @@ -533,7 +523,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * - * Returns the hardif neighbour node if found or created or NULL otherwise. + * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, @@ -549,12 +539,12 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, if (hardif_neigh) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) + if (!kref_get_unless_zero(&hard_iface->refcount)) goto out; hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); if (!hardif_neigh) { - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); goto out; } @@ -563,7 +553,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, hardif_neigh->if_incoming = hard_iface; hardif_neigh->last_seen = jiffies; - atomic_set(&hardif_neigh->refcount, 1); + kref_init(&hardif_neigh->refcount); if (bat_priv->bat_algo_ops->bat_hardif_neigh_init) bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh); @@ -581,7 +571,7 @@ out: * @hard_iface: the interface this neighbour is connected to * @neigh_addr: the interface address of the neighbour to retrieve * - * Returns the hardif neighbour node if found or created or NULL otherwise. + * Return: the hardif neighbour node if found or created or NULL otherwise. */ static struct batadv_hardif_neigh_node * batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, @@ -603,7 +593,8 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * @neigh_addr: the address of the neighbour * * Looks for and possibly returns a neighbour belonging to this hard interface. - * Returns NULL if the neighbour is not found. + * + * Return: neighbor when found. Othwerwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -617,7 +608,7 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr)) continue; - if (!atomic_inc_not_zero(&tmp_hardif_neigh->refcount)) + if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount)) continue; hardif_neigh = tmp_hardif_neigh; @@ -635,7 +626,8 @@ batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, * @neigh_addr: the mac address of the neighbour interface * * Allocates a new neigh_node object and initialises all the generic fields. - * Returns the new object or NULL on failure. + * + * Return: neighbor when found. Othwerwise NULL */ struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, @@ -658,7 +650,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, if (!neigh_node) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) { + if (!kref_get_unless_zero(&hard_iface->refcount)) { kfree(neigh_node); neigh_node = NULL; goto out; @@ -673,14 +665,15 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, neigh_node->orig_node = orig_node; /* extra reference for return */ - atomic_set(&neigh_node->refcount, 2); + kref_init(&neigh_node->refcount); + kref_get(&neigh_node->refcount); spin_lock_bh(&orig_node->neigh_list_lock); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); spin_unlock_bh(&orig_node->neigh_list_lock); /* increment unique neighbor refcount */ - atomic_inc(&hardif_neigh->refcount); + kref_get(&hardif_neigh->refcount); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", @@ -688,7 +681,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, out: if (hardif_neigh) - batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); return neigh_node; } @@ -697,7 +690,7 @@ out: * @seq: neighbour table seq_file struct * @offset: not used * - * Always returns 0. + * Return: always 0 */ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) { @@ -714,7 +707,7 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) primary_if->net_dev->dev_addr, net_dev->name, bat_priv->bat_algo_ops->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (!bat_priv->bat_algo_ops->bat_neigh_print) { seq_puts(seq, @@ -727,57 +720,72 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) } /** - * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object - * @rcu: rcu pointer of the orig_ifinfo object + * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the orig_ifinfo */ -static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu) +static void batadv_orig_ifinfo_release(struct kref *ref) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; - orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu); + orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount); if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing); + batadv_hardif_put(orig_ifinfo->if_outgoing); /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); if (router) - batadv_neigh_node_free_ref_now(router); - kfree(orig_ifinfo); + batadv_neigh_node_put(router); + + kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free - * the orig_ifinfo (without rcu callback) + * batadv_orig_ifinfo_put - decrement the refcounter and possibly release + * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ -static void -batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo) +void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) { - if (atomic_dec_and_test(&orig_ifinfo->refcount)) - batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu); + kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free - * the orig_ifinfo - * @orig_ifinfo: the orig_ifinfo object to release + * batadv_orig_node_free_rcu - free the orig_node + * @rcu: rcu pointer of the orig_node */ -void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) +static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { - if (atomic_dec_and_test(&orig_ifinfo->refcount)) - call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu); + struct batadv_orig_node *orig_node; + + orig_node = container_of(rcu, struct batadv_orig_node, rcu); + + batadv_mcast_purge_orig(orig_node); + + batadv_frag_purge_orig(orig_node, NULL); + + if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) + orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); + + kfree(orig_node->tt_buff); + kfree(orig_node); } -static void batadv_orig_node_free_rcu(struct rcu_head *rcu) +/** + * batadv_orig_node_release - release orig_node from lists and queue for + * free after rcu grace period + * @ref: kref pointer of the orig_node + */ +static void batadv_orig_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; - orig_node = container_of(rcu, struct batadv_orig_node, rcu); + orig_node = container_of(ref, struct batadv_orig_node, refcount); spin_lock_bh(&orig_node->neigh_list_lock); @@ -785,50 +793,30 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref_now(neigh_node); + batadv_neigh_node_put(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref_now(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); } spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_mcast_purge_orig(orig_node); - /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); - batadv_frag_purge_orig(orig_node, NULL); - - if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) - orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); - - kfree(orig_node->tt_buff); - kfree(orig_node); + call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } /** - * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly - * schedule an rcu callback for freeing it + * batadv_orig_node_put - decrement the orig node refcounter and possibly + * release it * @orig_node: the orig node to free */ -void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) +void batadv_orig_node_put(struct batadv_orig_node *orig_node) { - if (atomic_dec_and_test(&orig_node->refcount)) - call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); -} - -/** - * batadv_orig_node_free_ref_now - decrement the orig node refcounter and - * possibly free it (without rcu callback) - * @orig_node: the orig node to free - */ -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node) -{ - if (atomic_dec_and_test(&orig_node->refcount)) - batadv_orig_node_free_rcu(&orig_node->rcu); + kref_put(&orig_node->refcount, batadv_orig_node_release); } void batadv_originator_free(struct batadv_priv *bat_priv) @@ -855,7 +843,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(orig_node, node_tmp, head, hash_entry) { hlist_del_rcu(&orig_node->hash_entry); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } spin_unlock_bh(list_lock); } @@ -870,7 +858,8 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * * Creates a new originator object and initialise all the generic fields. * The new object is not added to the originator list. - * Returns the newly created object or NULL on failure. + * + * Return: the newly created object or NULL on failure. */ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr) @@ -899,7 +888,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, batadv_nc_init_orig(orig_node); /* extra reference for return */ - atomic_set(&orig_node->refcount, 2); + kref_init(&orig_node->refcount); + kref_get(&orig_node->refcount); orig_node->bat_priv = bat_priv; ether_addr_copy(orig_node->orig, addr); @@ -927,7 +917,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, * Immediately release vlan since it is not needed anymore in this * context */ - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { INIT_HLIST_HEAD(&orig_node->fragments[i].head); @@ -976,7 +966,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, neigh->addr, if_outgoing->net_dev->name); hlist_del_rcu(&neigh_ifinfo->list); - batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); } spin_unlock_bh(&neigh->ifinfo_lock); @@ -987,7 +977,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * - * Returns true if any ifinfo entry was purged, false otherwise. + * Return: true if any ifinfo entry was purged, false otherwise. */ static bool batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, @@ -1022,10 +1012,10 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, ifinfo_purged = true; hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); if (orig_node->last_bonding_candidate == orig_ifinfo) { orig_node->last_bonding_candidate = NULL; - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); } } @@ -1039,7 +1029,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked * - * Returns true if any neighbor was purged, false otherwise + * Return: true if any neighbor was purged, false otherwise */ static bool batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, @@ -1079,7 +1069,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, neigh_purged = true; hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); } else { /* only necessary if not the whole neighbor is to be * deleted, but some interface has been removed. @@ -1098,7 +1088,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, * @orig_node: orig node which is to be checked * @if_outgoing: the interface for which the metric should be compared * - * Returns the current best neighbor, with refcount increased. + * Return: the current best neighbor, with refcount increased. */ static struct batadv_neigh_node * batadv_find_best_neighbor(struct batadv_priv *bat_priv, @@ -1114,11 +1104,11 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, best, if_outgoing) <= 0)) continue; - if (!atomic_inc_not_zero(&neigh->refcount)) + if (!kref_get_unless_zero(&neigh->refcount)) continue; if (best) - batadv_neigh_node_free_ref(best); + batadv_neigh_node_put(best); best = neigh; } @@ -1135,7 +1125,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, * This function checks if the orig_node or substructures of it have become * obsolete, and purges this information if that's the case. * - * Returns true if the orig_node is to be removed, false otherwise. + * Return: true if the orig_node is to be removed, false otherwise. */ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) @@ -1164,7 +1154,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best_neigh_node); if (best_neigh_node) - batadv_neigh_node_free_ref(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); /* ... then for all other interfaces. */ rcu_read_lock(); @@ -1181,7 +1171,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, hard_iface, best_neigh_node); if (best_neigh_node) - batadv_neigh_node_free_ref(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); } rcu_read_unlock(); @@ -1214,7 +1204,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) batadv_tt_global_del_orig(orig_node->bat_priv, orig_node, -1, "originator timed out"); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); continue; } @@ -1260,7 +1250,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) primary_if->net_dev->dev_addr, net_dev->name, bat_priv->bat_algo_ops->name); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (!bat_priv->bat_algo_ops->bat_orig_print) { seq_puts(seq, @@ -1280,7 +1270,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) * @seq: debugfs table seq_file struct * @offset: not used * - * Returns 0 + * Return: 0 */ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) { @@ -1316,7 +1306,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset) out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return 0; } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 29557753d552..4e8b67f11051 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -20,10 +20,10 @@ #include "main.h" -#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> +#include <linux/kref.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/stddef.h> @@ -37,20 +37,19 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); -void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); +void batadv_orig_node_put(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void -batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh); +batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh); struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); +void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); @@ -60,7 +59,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); -void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo); +void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); @@ -70,7 +69,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); -void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo); +void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); @@ -84,7 +83,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); -void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan); +void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan); /* hashfunction to choose an entry in a hash table of given size * hash algorithm from http://en.wikipedia.org/wiki/Hash_table @@ -116,7 +115,7 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data) if (!batadv_compare_eth(orig_node, data)) continue; - if (!atomic_inc_not_zero(&orig_node->refcount)) + if (!kref_get_unless_zero(&orig_node->refcount)) continue; orig_node_tmp = orig_node; diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 0558e3237e0e..e7f915181aba 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -158,7 +158,7 @@ enum batadv_tt_client_flags { }; /** - * batadv_vlan_flags - flags for the four MSB of any vlan ID field + * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not */ enum batadv_vlan_flags { @@ -209,6 +209,11 @@ struct batadv_bla_claim_dst { * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header * @flags: contains routing relevant flags - see enum batadv_iv_flags + * @seqno: sequence identification + * @orig: address of the source node + * @prev_sender: address of the previous sender + * @reserved: reserved byte for alignment + * @tq: transmission quality * @tvlv_len: length of tvlv data following the ogm header */ struct batadv_ogm_packet { @@ -230,7 +235,7 @@ struct batadv_ogm_packet { #define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet) /** - * batadv_icmp_header - common members among all the ICMP packets + * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -256,7 +261,7 @@ struct batadv_icmp_header { }; /** - * batadv_icmp_packet - ICMP packet + * struct batadv_icmp_packet - ICMP packet * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -282,7 +287,7 @@ struct batadv_icmp_packet { #define BATADV_RR_LEN 16 /** - * batadv_icmp_packet_rr - ICMP RouteRecord packet + * struct batadv_icmp_packet_rr - ICMP RouteRecord packet * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header @@ -345,6 +350,7 @@ struct batadv_unicast_packet { * @u: common unicast packet header * @src: address of the source * @subtype: packet subtype + * @reserved: reserved byte for alignment */ struct batadv_unicast_4addr_packet { struct batadv_unicast_packet u; @@ -413,7 +419,6 @@ struct batadv_bcast_packet { * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header - * @reserved: Align following fields to 2-byte boundaries * @first_source: original source of first included packet * @first_orig_dest: original destinal of first included packet * @first_crc: checksum of first included packet @@ -495,7 +500,7 @@ struct batadv_tvlv_gateway_data { * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container * @flags: translation table flags (see batadv_tt_data_flags) * @ttvn: translation table version number - * @vlan_num: number of announced VLANs. In the TVLV this struct is followed by + * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by * one batadv_tvlv_tt_vlan_data object per announced vlan */ struct batadv_tvlv_tt_data { diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index e4f2646d9246..4dd646a52f1a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -25,6 +25,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/jiffies.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> @@ -72,7 +73,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, rcu_read_lock(); curr_router = rcu_dereference(orig_ifinfo->router); - if (curr_router && !atomic_inc_not_zero(&curr_router->refcount)) + if (curr_router && !kref_get_unless_zero(&curr_router->refcount)) curr_router = NULL; rcu_read_unlock(); @@ -97,20 +98,20 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, } if (curr_router) - batadv_neigh_node_free_ref(curr_router); + batadv_neigh_node_put(curr_router); /* increase refcount of new best neighbor */ - if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount)) + if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount)) neigh_node = NULL; spin_lock_bh(&orig_node->neigh_list_lock); rcu_assign_pointer(orig_ifinfo->router, neigh_node); spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); /* decrease refcount of previous best neighbor */ if (curr_router) - batadv_neigh_node_free_ref(curr_router); + batadv_neigh_node_put(curr_router); } /** @@ -137,24 +138,38 @@ void batadv_update_route(struct batadv_priv *bat_priv, out: if (router) - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); } -/* checks whether the host restarted and is in the protection time. - * returns: - * 0 if the packet is to be accepted +/** + * batadv_window_protected - checks whether the host restarted and is in the + * protection time. + * @bat_priv: the bat priv with all the soft interface information + * @seq_num_diff: difference between the current/received sequence number and + * the last sequence number + * @seq_old_max_diff: maximum age of sequence number not considered as restart + * @last_reset: jiffies timestamp of the last reset, will be updated when reset + * is detected + * @protection_started: is set to true if the protection window was started, + * doesn't change otherwise. + * + * Return: + * 0 if the packet is to be accepted. * 1 if the packet is to be ignored. */ int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - unsigned long *last_reset) + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started) { - if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE || + if (seq_num_diff <= -seq_old_max_diff || seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) { if (!batadv_has_timed_out(*last_reset, BATADV_RESET_PROTECTION_MS)) return 1; *last_reset = jiffies; + if (protection_started) + *protection_started = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "old packet received, start protection\n"); } @@ -198,7 +213,7 @@ bool batadv_check_management_packet(struct sk_buff *skb, * @bat_priv: the bat priv with all the soft interface information * @skb: icmp packet to process * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, @@ -254,9 +269,9 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -302,9 +317,9 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -388,7 +403,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -398,10 +413,11 @@ out: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. Returns negative - * value when check fails and 0 otherwise. The negative value depends on the - * reason: -ENODATA for bad header, -EBADR for broadcast destination or source, - * and -EREMOTE for non-local (other host) destination. + * Check for short header and bad addresses in given packet. + * + * Return: negative value when check fails and 0 otherwise. The negative value + * depends on the reason: -ENODATA for bad header, -EBADR for broadcast + * destination or source, and -EREMOTE for non-local (other host) destination. */ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -435,7 +451,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, * @orig_node: the destination node * @recv_if: pointer to interface this packet was received on * - * Returns the router which should be used for this orig_node on + * Return: the router which should be used for this orig_node on * this interface, or NULL if not available. */ struct batadv_neigh_node * @@ -482,14 +498,14 @@ batadv_find_router(struct batadv_priv *bat_priv, hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) { /* acquire some structures and references ... */ - if (!atomic_inc_not_zero(&cand->refcount)) + if (!kref_get_unless_zero(&cand->refcount)) continue; cand_router = rcu_dereference(cand->router); if (!cand_router) goto next; - if (!atomic_inc_not_zero(&cand_router->refcount)) { + if (!kref_get_unless_zero(&cand_router->refcount)) { cand_router = NULL; goto next; } @@ -508,8 +524,8 @@ batadv_find_router(struct batadv_priv *bat_priv, /* mark the first possible candidate */ if (!first_candidate) { - atomic_inc(&cand_router->refcount); - atomic_inc(&cand->refcount); + kref_get(&cand_router->refcount); + kref_get(&cand->refcount); first_candidate = cand; first_candidate_router = cand_router; } @@ -529,16 +545,16 @@ batadv_find_router(struct batadv_priv *bat_priv, next: /* free references */ if (cand_router) { - batadv_neigh_node_free_ref(cand_router); + batadv_neigh_node_put(cand_router); cand_router = NULL; } - batadv_orig_ifinfo_free_ref(cand); + batadv_orig_ifinfo_put(cand); } rcu_read_unlock(); /* last_bonding_candidate is reset below, remove the old reference. */ if (orig_node->last_bonding_candidate) - batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate); + batadv_orig_ifinfo_put(orig_node->last_bonding_candidate); /* After finding candidates, handle the three cases: * 1) there is a next candidate, use that @@ -546,17 +562,17 @@ next: * 3) there is no candidate at all, return the default router */ if (next_candidate) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); /* remove references to first candidate, we don't need it. */ if (first_candidate) { - batadv_neigh_node_free_ref(first_candidate_router); - batadv_orig_ifinfo_free_ref(first_candidate); + batadv_neigh_node_put(first_candidate_router); + batadv_orig_ifinfo_put(first_candidate); } router = next_candidate_router; orig_node->last_bonding_candidate = next_candidate; } else if (first_candidate) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); /* refcounting has already been done in the loop above. */ router = first_candidate_router; @@ -633,7 +649,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -648,7 +664,7 @@ out: * the new corresponding information (originator address where the destination * client currently is and its known TTVN) * - * Returns true if the packet header has been updated, false otherwise + * Return: true if the packet header has been updated, false otherwise */ static bool batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, @@ -686,9 +702,9 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, ret = true; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -752,7 +768,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, return 0; curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn); - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } /* check if the TTVN contained in the packet is fresher than what the @@ -792,7 +808,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); unicast_packet->ttvn = curr_ttvn; @@ -805,7 +821,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, @@ -892,7 +908,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, rx_success: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -904,9 +920,8 @@ rx_success: * batadv_recv_unicast_tvlv - receive and process unicast tvlv packets * @skb: unicast tvlv packet to process * @recv_if: pointer to interface this packet was received on - * @dst_addr: the payload destination * - * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP + * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP * otherwise. */ int batadv_recv_unicast_tvlv(struct sk_buff *skb, @@ -960,7 +975,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till * lack further fragments; 3) Merge fragments, if we have all needed parts. * - * Return NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. + * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. */ int batadv_recv_frag_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) @@ -1004,7 +1019,7 @@ int batadv_recv_frag_packet(struct sk_buff *skb, out: if (orig_node_src) - batadv_orig_node_free_ref(orig_node_src); + batadv_orig_node_put(orig_node_src); return ret; } @@ -1065,7 +1080,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, /* check whether the packet is old and the host just restarted. */ if (batadv_window_protected(bat_priv, seq_diff, - &orig_node->bcast_seqno_reset)) + BATADV_BCAST_MAX_AGE, + &orig_node->bcast_seqno_reset, NULL)) goto spin_unlock; /* mark broadcast in flood history, update window position @@ -1108,6 +1124,6 @@ spin_unlock: spin_unlock_bh(&orig_node->bcast_seqno_lock); out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return ret; } diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 204bbe4952a6..02a5caa84127 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -52,6 +52,7 @@ batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - unsigned long *last_reset); + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started); #endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 782fa33ec296..caff32cf6fe7 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -111,7 +111,7 @@ send_skb_err: * host, NULL can be passed as recv_if and no interface alternating is * attempted. * - * Returns NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or + * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or * NET_XMIT_POLICED if the skb is buffered for later transmit. */ int batadv_send_skb_to_orig(struct sk_buff *skb, @@ -153,7 +153,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, out: if (neigh_node) - batadv_neigh_node_free_ref(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -165,7 +165,7 @@ out: * @hdr_size: amount of bytes to push at the beginning of the skb * @orig_node: the destination node * - * Returns false if the buffer extension was not possible or true otherwise. + * Return: false if the buffer extension was not possible or true otherwise. */ static bool batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, @@ -196,7 +196,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * - * Returns false if the payload could not be encapsulated or true otherwise. + * Return: false if the payload could not be encapsulated or true otherwise. */ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, struct batadv_orig_node *orig_node) @@ -211,10 +211,10 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, * unicast 4addr header * @bat_priv: the bat priv with all the soft interface information * @skb: the skb containing the payload to encapsulate - * @orig_node: the destination node + * @orig: the destination node * @packet_subtype: the unicast 4addr packet subtype to use * - * Returns false if the payload could not be encapsulated or true otherwise. + * Return: false if the payload could not be encapsulated or true otherwise. */ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, @@ -246,7 +246,7 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, ret = true; out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return ret; } @@ -265,7 +265,7 @@ out: * as packet_type. Then send this frame to the given orig_node and release a * reference to this orig_node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, @@ -317,7 +317,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); if (ret == NET_XMIT_DROP) kfree_skb(skb); return ret; @@ -339,7 +339,7 @@ out: * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame * to the according destination node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, @@ -373,7 +373,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, * Look up the currently selected gateway. Wrap the given skb into a batman-adv * unicast header and send this frame to this gateway node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) @@ -409,9 +409,9 @@ static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { kfree_skb(forw_packet->skb); if (forw_packet->if_incoming) - batadv_hardif_free_ref(forw_packet->if_incoming); + batadv_hardif_put(forw_packet->if_incoming); if (forw_packet->if_outgoing) - batadv_hardif_free_ref(forw_packet->if_outgoing); + batadv_hardif_put(forw_packet->if_outgoing); kfree(forw_packet); } @@ -430,14 +430,19 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, send_time); } -/* add a broadcast packet to the queue and setup timers. broadcast packets - * are sent multiple times to increase probability for being received. +/** + * batadv_add_bcast_packet_to_list - queue broadcast packet for multiple sends + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending * - * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on - * errors. + * add a broadcast packet to the queue and setup timers. broadcast packets + * are sent multiple times to increase probability for being received. * * The skb is not consumed, so the caller should make sure that the * skb is freed. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, @@ -492,7 +497,7 @@ out_and_inc: atomic_inc(&bat_priv->bcast_queue_left); out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NETDEV_TX_BUSY; } diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 82059f259e46..7ff95cada2e7 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -69,7 +69,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, * header via the translation table. Wrap the given skb into a batman-adv * unicast header. Then send this frame to the according destination node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *dst_hint, @@ -92,7 +92,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, * unicast-4addr header. Then send this frame to the according destination * node. * - * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. + * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index ac4d08de5df4..0710379491bf 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -30,6 +30,7 @@ #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -376,7 +377,7 @@ dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return NETDEV_TX_OK; } @@ -478,22 +479,34 @@ out: } /** - * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and - * possibly free it - * @softif_vlan: the vlan object to release + * batadv_softif_vlan_release - release vlan from lists and queue for free after + * rcu grace period + * @ref: kref pointer of the vlan object */ -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) +static void batadv_softif_vlan_release(struct kref *ref) +{ + struct batadv_softif_vlan *vlan; + + vlan = container_of(ref, struct batadv_softif_vlan, refcount); + + spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); + hlist_del_rcu(&vlan->list); + spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); + + kfree_rcu(vlan, rcu); +} + +/** + * batadv_softif_vlan_put - decrease the vlan object refcounter and + * possibly release it + * @vlan: the vlan object to release + */ +void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) { if (!vlan) return; - if (atomic_dec_and_test(&vlan->refcount)) { - spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); - hlist_del_rcu(&vlan->list); - spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); - - kfree_rcu(vlan, rcu); - } + kref_put(&vlan->refcount, batadv_softif_vlan_release); } /** @@ -501,7 +514,7 @@ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * - * Returns the private data of the vlan matching the vid passed as argument or + * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, @@ -514,7 +527,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, if (vlan_tmp->vid != vid) continue; - if (!atomic_inc_not_zero(&vlan_tmp->refcount)) + if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; @@ -530,7 +543,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * - * Returns 0 on success, a negative error otherwise. + * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { @@ -539,7 +552,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return -EEXIST; } @@ -549,7 +562,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) vlan->bat_priv = bat_priv; vlan->vid = vid; - atomic_set(&vlan->refcount, 1); + kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); @@ -588,18 +601,19 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, vlan->vid, "vlan interface destroyed", false); batadv_sysfs_del_vlan(bat_priv, vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid - ndo_add_vid API implementation * @dev: the netdev of the mesh interface + * @proto: protocol of the the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * - * Returns 0 on success or a negative error code in case of failure. + * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) @@ -632,7 +646,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, if (!vlan->kobj) { ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); if (ret) { - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return ret; } } @@ -651,12 +665,13 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, /** * batadv_interface_kill_vid - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface + * @proto: protocol of the the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * - * Returns 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q + * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, @@ -678,7 +693,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); return 0; } @@ -734,7 +749,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work) vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } batadv_sysfs_del_meshif(soft_iface); @@ -745,7 +760,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work) * batadv_softif_init_late - late stage initialization of soft interface * @dev: registered network device to modify * - * Returns error code on failures + * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { @@ -847,7 +862,7 @@ free_bat_counters: * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * - * Return 0 if successful or error otherwise. + * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev) @@ -863,7 +878,7 @@ static int batadv_softif_slave_add(struct net_device *dev, out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -872,7 +887,7 @@ out: * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * - * Return 0 if successful or error otherwise. + * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) @@ -890,7 +905,7 @@ static int batadv_softif_slave_del(struct net_device *dev, out: if (hard_iface) - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 8e82176f40b1..9ae265703d23 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -34,7 +34,7 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface); int batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan); +void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fe87777fda8a..4d70d4413e40 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/if.h> #include <linux/if_vlan.h> +#include <linux/kref.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -64,7 +65,7 @@ static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj) * batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv * @obj: kobject to covert * - * Returns the associated batadv_priv struct. + * Return: the associated batadv_priv struct. */ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) { @@ -82,9 +83,10 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj) /** * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct + * @bat_priv: the bat priv with all the soft interface information * @obj: kobject to covert * - * Returns the associated softif_vlan struct if found, NULL otherwise. + * Return: the associated softif_vlan struct if found, NULL otherwise. */ static struct batadv_softif_vlan * batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) @@ -96,7 +98,7 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) if (vlan_tmp->kobj != obj) continue; - if (!atomic_inc_not_zero(&vlan_tmp->refcount)) + if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; @@ -214,7 +216,7 @@ ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \ attr, &vlan->_name, \ bat_priv->soft_iface); \ \ - batadv_softif_vlan_free_ref(vlan); \ + batadv_softif_vlan_put(vlan); \ return res; \ } @@ -229,7 +231,7 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ atomic_read(&vlan->_name) == 0 ? \ "disabled" : "enabled"); \ \ - batadv_softif_vlan_free_ref(vlan); \ + batadv_softif_vlan_put(vlan); \ return res; \ } @@ -491,7 +493,7 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj, * @attr: the batman-adv attribute the user is interacting with * @buff: the buffer that will contain the data to send back to the user * - * Returns the number of bytes written into 'buff' on success or a negative + * Return: the number of bytes written into 'buff' on success or a negative * error code in case of failure */ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, @@ -511,7 +513,7 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, * @buff: the buffer containing the user data * @count: number of bytes in the buffer * - * Returns 'count' on success or a negative error code in case of failure + * Return: 'count' on success or a negative error code in case of failure */ static ssize_t batadv_store_isolation_mark(struct kobject *kobj, struct attribute *attr, char *buff, @@ -620,9 +622,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = { BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL); -/** - * batadv_vlan_attrs - array of vlan specific sysfs attributes - */ +/* array of vlan specific sysfs attributes */ static struct batadv_attribute *batadv_vlan_attrs[] = { &batadv_attr_vlan_ap_isolation, NULL, @@ -683,7 +683,7 @@ void batadv_sysfs_del_meshif(struct net_device *dev) * @dev: netdev of the mesh interface * @vlan: private data of the newly added VLAN interface * - * Returns 0 on success and -ENOMEM if any of the structure allocations fails. + * Return: 0 on success and -ENOMEM if any of the structure allocations fails. */ int batadv_sysfs_add_vlan(struct net_device *dev, struct batadv_softif_vlan *vlan) @@ -771,7 +771,7 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj, length = sprintf(buff, "%s\n", ifname); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return length; } @@ -795,7 +795,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, if (strlen(buff) >= IFNAMSIZ) { pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n", buff); - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return -EINVAL; } @@ -829,7 +829,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, unlock: rtnl_unlock(); out: - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -863,7 +863,7 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj, break; } - batadv_hardif_free_ref(hard_iface); + batadv_hardif_put(hard_iface); return length; } diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 61974428a7af..c76021b4e198 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a22080c53401..0b43e86328a5 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -31,6 +31,7 @@ #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -68,7 +69,15 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, unsigned short vid, const char *message, bool roaming); -/* returns 1 if they are the same mac addr and vid */ +/** + * batadv_compare_tt - check if two TT entries are the same + * @node: the list element pointer of the first TT entry + * @data2: pointer to the tt_common_entry of the second TT entry + * + * Compare the MAC address and the VLAN ID of the two TT entries and check if + * they are the same TT client. + * Return: 1 if the two TT clients are the same, 0 otherwise + */ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, @@ -84,7 +93,7 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) * @data: pointer to the tt_common_entry object to map * @size: the size of the hash table * - * Returns the hash index where the object represented by 'data' should be + * Return: the hash index where the object represented by 'data' should be * stored at. */ static inline u32 batadv_choose_tt(const void *data, u32 size) @@ -105,7 +114,7 @@ static inline u32 batadv_choose_tt(const void *data, u32 size) * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the tt_common struct belonging to the searched client if + * Return: a pointer to the tt_common struct belonging to the searched client if * found, NULL otherwise. */ static struct batadv_tt_common_entry * @@ -133,7 +142,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, if (tt->vid != vid) continue; - if (!atomic_inc_not_zero(&tt->refcount)) + if (!kref_get_unless_zero(&tt->refcount)) continue; tt_tmp = tt; @@ -150,7 +159,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the corresponding tt_local_entry struct if the client is + * Return: a pointer to the corresponding tt_local_entry struct if the client is * found, NULL otherwise. */ static struct batadv_tt_local_entry * @@ -175,7 +184,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, * @addr: the mac address of the client to look for * @vid: VLAN identifier * - * Returns a pointer to the corresponding tt_global_entry struct if the client + * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. */ static struct batadv_tt_global_entry * @@ -194,34 +203,68 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, return tt_global_entry; } +/** + * batadv_tt_local_entry_release - release tt_local_entry from lists and queue + * for free after rcu grace period + * @ref: kref pointer of the nc_node + */ +static void batadv_tt_local_entry_release(struct kref *ref) +{ + struct batadv_tt_local_entry *tt_local_entry; + + tt_local_entry = container_of(ref, struct batadv_tt_local_entry, + common.refcount); + + kfree_rcu(tt_local_entry, common.rcu); +} + +/** + * batadv_tt_local_entry_put - decrement the tt_local_entry refcounter and + * possibly release it + * @tt_local_entry: tt_local_entry to be free'd + */ static void -batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry) +batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) +{ + kref_put(&tt_local_entry->common.refcount, + batadv_tt_local_entry_release); +} + +/** + * batadv_tt_global_entry_release - release tt_global_entry from lists and queue + * for free after rcu grace period + * @ref: kref pointer of the nc_node + */ +static void batadv_tt_global_entry_release(struct kref *ref) { - if (atomic_dec_and_test(&tt_local_entry->common.refcount)) - kfree_rcu(tt_local_entry, common.rcu); + struct batadv_tt_global_entry *tt_global_entry; + + tt_global_entry = container_of(ref, struct batadv_tt_global_entry, + common.refcount); + + batadv_tt_global_del_orig_list(tt_global_entry); + kfree_rcu(tt_global_entry, common.rcu); } /** - * batadv_tt_global_entry_free_ref - decrement the refcounter for a - * tt_global_entry and possibly free it - * @tt_global_entry: the object to free + * batadv_tt_global_entry_put - decrement the tt_global_entry refcounter and + * possibly release it + * @tt_global_entry: tt_global_entry to be free'd */ static void -batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry) +batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { - if (atomic_dec_and_test(&tt_global_entry->common.refcount)) { - batadv_tt_global_del_orig_list(tt_global_entry); - kfree_rcu(tt_global_entry, common.rcu); - } + kref_put(&tt_global_entry->common.refcount, + batadv_tt_global_entry_release); } /** * batadv_tt_global_hash_count - count the number of orig entries - * @hash: hash table containing the tt entries + * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier * - * Return the number of originators advertising the given address/data + * Return: the number of originators advertising the given address/data * (excluding ourself). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, @@ -235,25 +278,11 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, return 0; count = atomic_read(&tt_global_entry->orig_list_count); - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); return count; } -static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_orig_list_entry *orig_entry; - - orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - - /* We are in an rcu callback here, therefore we cannot use - * batadv_orig_node_free_ref() and its call_rcu(): - * An rcu_barrier() wouldn't wait for that to finish - */ - batadv_orig_node_free_ref_now(orig_entry->orig_node); - kfree(orig_entry); -} - /** * batadv_tt_local_size_mod - change the size by v of the local table identified * by vid @@ -272,7 +301,7 @@ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, atomic_add(v, &vlan->tt.num_entries); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } /** @@ -300,9 +329,9 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, } /** - * batadv_tt_global_size_mod - change the size by v of the local table - * identified by vid - * @bat_priv: the bat priv with all the soft interface information + * batadv_tt_global_size_mod - change the size by v of the global table + * for orig_node identified by vid + * @orig_node: the originator for which the table has to be modified * @vid: the VLAN identifier * @v: the amount to sum to the global table size */ @@ -317,12 +346,14 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - hlist_del_init_rcu(&vlan->list); + if (!hlist_unhashed(&vlan->list)) { + hlist_del_init_rcu(&vlan->list); + batadv_orig_node_vlan_put(vlan); + } spin_unlock_bh(&orig_node->vlan_list_lock); - batadv_orig_node_vlan_free_ref(vlan); } - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } /** @@ -349,13 +380,31 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, batadv_tt_global_size_mod(orig_node, vid, -1); } -static void -batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) +/** + * batadv_tt_orig_list_entry_release - release tt orig entry from lists and + * queue for free after rcu grace period + * @ref: kref pointer of the tt orig entry + */ +static void batadv_tt_orig_list_entry_release(struct kref *ref) { - if (!atomic_dec_and_test(&orig_entry->refcount)) - return; + struct batadv_tt_orig_list_entry *orig_entry; + + orig_entry = container_of(ref, struct batadv_tt_orig_list_entry, + refcount); - call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); + batadv_orig_node_put(orig_entry->orig_node); + kfree_rcu(orig_entry, rcu); +} + +/** + * batadv_tt_orig_list_entry_put - decrement the tt orig entry refcounter and + * possibly release it + * @orig_entry: tt orig entry to be free'd + */ +static void +batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) +{ + kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } /** @@ -437,7 +486,7 @@ unlock: * batadv_tt_len - compute length in bytes of given number of tt changes * @changes_num: number of tt changes * - * Returns computed length in bytes. + * Return: computed length in bytes. */ static int batadv_tt_len(int changes_num) { @@ -448,7 +497,7 @@ static int batadv_tt_len(int changes_num) * batadv_tt_entries - compute the number of entries fitting in tt_len bytes * @tt_len: available space * - * Returns the number of entries. + * Return: the number of entries. */ static u16 batadv_tt_entries(u16 tt_len) { @@ -460,7 +509,7 @@ static u16 batadv_tt_entries(u16 tt_len) * size when transmitted over the air * @bat_priv: the bat priv with all the soft interface information * - * Returns local translation table size in bytes. + * Return: local translation table size in bytes. */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { @@ -512,7 +561,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, &tt_global->common); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } /** @@ -526,7 +575,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, * @mark: the value contained in the skb->mark field of the received packet (if * any) * - * Returns true if the client was successfully added, false otherwise. + * Return: true if the client was successfully added, false otherwise. */ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark) @@ -620,7 +669,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, tt_local->common.vid = vid; if (batadv_is_wifi_netdev(in_dev)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; - atomic_set(&tt_local->common.refcount, 2); + kref_init(&tt_local->common.refcount); + kref_get(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; @@ -637,8 +687,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_tt_local_entry_free_ref(tt_local); - batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_put(tt_local); + batadv_softif_vlan_put(vlan); goto out; } @@ -704,9 +754,9 @@ out: if (in_dev) dev_put(in_dev); if (tt_local) - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); if (tt_global) - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); return ret; } @@ -721,12 +771,11 @@ out: * function reserves the amount of space needed to send the entire global TT * table. In case of success the value is updated with the real amount of * reserved bytes - * Allocate the needed amount of memory for the entire TT TVLV and write its * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * - * Return the size of the allocated buffer or 0 in case of failure. + * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, @@ -800,7 +849,7 @@ out: * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN. * - * Return the size of the allocated buffer or 0 in case of failure. + * Return: the size of the allocated buffer or 0 in case of failure. */ static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, @@ -1005,13 +1054,13 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) no_purge ? 0 : last_seen_msecs, vlan->tt.crc); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); } rcu_read_unlock(); } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1042,7 +1091,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, * @message: message to append to the log on deletion * @roaming: true if the deletion is due to a roaming event * - * Returns the flags assigned to the local entry before being deleted + * Return: the flags assigned to the local entry before being deleted */ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, @@ -1088,19 +1137,19 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, goto out; /* extra call to free the local tt entry */ - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) goto out; - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); + batadv_softif_vlan_put(vlan); out: if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } @@ -1196,11 +1245,11 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) vlan = batadv_softif_vlan_get(bat_priv, tt_common_entry->vid); if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); + batadv_softif_vlan_put(vlan); } - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } @@ -1242,10 +1291,16 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.changes_list_lock); } -/* retrieves the orig_tt_list_entry belonging to orig_node from the +/** + * batadv_tt_global_orig_entry_find - find a TT orig_list_entry + * @entry: the TT global entry where the orig_list_entry has to be + * extracted from + * @orig_node: the originator for which the orig_list_entry has to be found + * + * retrieve the orig_tt_list_entry belonging to orig_node from the * batadv_tt_global_entry list * - * returns it with an increased refcounter, NULL if not found + * Return: it with an increased refcounter, NULL if not found */ static struct batadv_tt_orig_list_entry * batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, @@ -1259,7 +1314,7 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, hlist_for_each_entry_rcu(tmp_orig_entry, head, list) { if (tmp_orig_entry->orig_node != orig_node) continue; - if (!atomic_inc_not_zero(&tmp_orig_entry->refcount)) + if (!kref_get_unless_zero(&tmp_orig_entry->refcount)) continue; orig_entry = tmp_orig_entry; @@ -1270,8 +1325,15 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry, return orig_entry; } -/* find out if an orig_node is already in the list of a tt_global_entry. - * returns true if found, false otherwise +/** + * batadv_tt_global_entry_has_orig - check if a TT global entry is also handled + * by a given originator + * @entry: the TT global entry to check + * @orig_node: the originator to search in the list + * + * find out if an orig_node is already in the list of a tt_global_entry. + * + * Return: true if found, false otherwise */ static bool batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, @@ -1283,7 +1345,7 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node); if (orig_entry) { found = true; - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } return found; @@ -1309,11 +1371,12 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, goto out; INIT_HLIST_NODE(&orig_entry->list); - atomic_inc(&orig_node->refcount); + kref_get(&orig_node->refcount); batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; - atomic_set(&orig_entry->refcount, 2); + kref_init(&orig_entry->refcount); + kref_get(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); hlist_add_head_rcu(&orig_entry->list, @@ -1323,7 +1386,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, out: if (orig_entry) - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } /** @@ -1343,7 +1406,7 @@ out: * * The caller must hold orig_node refcount. * - * Return true if the new entry has been added, false otherwise + * Return: true if the new entry has been added, false otherwise */ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -1389,7 +1452,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, */ if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; - atomic_set(&common->refcount, 2); + kref_init(&common->refcount); + kref_get(&common->refcount); common->added_at = jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); @@ -1403,7 +1467,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); goto out_remove; } } else { @@ -1489,9 +1553,9 @@ out_remove: out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -1501,7 +1565,7 @@ out: * @tt_global_entry: global translation table entry to be analyzed * * This functon assumes the caller holds rcu_read_lock(). - * Returns best originator list entry or NULL on errors. + * Return: best originator list entry or NULL on errors. */ static struct batadv_tt_orig_list_entry * batadv_transtable_best_orig(struct batadv_priv *bat_priv, @@ -1522,20 +1586,20 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, if (best_router && bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT, best_router, BATADV_IF_DEFAULT) <= 0) { - batadv_neigh_node_free_ref(router); + batadv_neigh_node_put(router); continue; } /* release the refcount for the "old" best */ if (best_router) - batadv_neigh_node_free_ref(best_router); + batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } if (best_router) - batadv_neigh_node_free_ref(best_router); + batadv_neigh_node_put(best_router); return best_entry; } @@ -1588,7 +1652,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } print_list: @@ -1620,7 +1684,7 @@ print_list: ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); } } @@ -1661,7 +1725,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) } out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); return 0; } @@ -1689,7 +1753,7 @@ _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, * being part of a list */ hlist_del_rcu(&orig_entry->list); - batadv_tt_orig_list_entry_free_ref(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); } /* deletes the orig list of a tt_global_entry */ @@ -1845,9 +1909,9 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (local_entry) - batadv_tt_local_entry_free_ref(local_entry); + batadv_tt_local_entry_put(local_entry); } /** @@ -1901,7 +1965,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, tt_global->common.addr, BATADV_PRINT_VID(vid), message); hlist_del_rcu(&tt_common_entry->hash_entry); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } } spin_unlock_bh(list_lock); @@ -1964,7 +2028,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) hlist_del_rcu(&tt_common->hash_entry); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } @@ -1996,7 +2060,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) tt_global = container_of(tt_common_entry, struct batadv_tt_global_entry, common); - batadv_tt_global_entry_free_ref(tt_global); + batadv_tt_global_entry_put(tt_global); } spin_unlock_bh(list_lock); } @@ -2031,7 +2095,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, * @addr: mac address of the destination client * @vid: VLAN identifier * - * Returns a pointer to the originator that was selected as destination in the + * Return: a pointer to the originator that was selected as destination in the * mesh for contacting the client 'addr', NULL otherwise. * In case of multiple originators serving the same client, the function returns * the best one (best in terms of metric towards the destination node). @@ -2071,15 +2135,15 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, /* found anything? */ if (best_entry) orig_node = best_entry->orig_node; - if (orig_node && !atomic_inc_not_zero(&orig_node->refcount)) + if (orig_node && !kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; rcu_read_unlock(); out: if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return orig_node; } @@ -2106,7 +2170,7 @@ out: * because the XOR operation can combine them all while trying to reduce the * noise as much as possible. * - * Returns the checksum of the global table of a given originator. + * Return: the checksum of the global table of a given originator. */ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -2183,7 +2247,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, * For details about the computation, please refer to the documentation for * batadv_tt_global_crc(). * - * Returns the checksum of the local table + * Return: the checksum of the local table */ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, unsigned short vid) @@ -2289,7 +2353,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node this request is being issued for * - * Returns the pointer to the new tt_req_node struct if no request + * Return: the pointer to the new tt_req_node struct if no request * has already been issued for this orig_node, NULL otherwise. */ static struct batadv_tt_req_node * @@ -2324,7 +2388,7 @@ unlock: * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * - * Returns 1 if the entry is a valid, 0 otherwise. + * Return: 1 if the entry is a valid, 0 otherwise. */ static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) { @@ -2408,9 +2472,8 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, * @orig_node: originator for which the CRCs have to be checked * @tt_vlan: pointer to the first tvlv VLAN entry * @num_vlan: number of tvlv VLAN entries - * @create: if true, create VLAN objects if not found * - * Return true if all the received CRCs match the locally stored ones, false + * Return: true if all the received CRCs match the locally stored ones, false * otherwise */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, @@ -2440,7 +2503,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, return false; crc = vlan->tt.crc; - batadv_orig_node_vlan_free_ref(vlan); + batadv_orig_node_vlan_put(vlan); if (crc != ntohl(tt_vlan_tmp->crc)) return false; @@ -2513,6 +2576,8 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, * @num_vlan: number of tvlv VLAN entries * @full_table: ask for the entire translation table if true, while only for the * last TT diff otherwise + * + * Return: true if the TT Request was sent, false otherwise */ static int batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, @@ -2573,7 +2638,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); /* hlist_del_init() verifies tt_req_node still is in the list */ @@ -2593,7 +2658,7 @@ out: * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2711,9 +2776,9 @@ unlock: out: if (res_dst_orig_node) - batadv_orig_node_free_ref(res_dst_orig_node); + batadv_orig_node_put(res_dst_orig_node); if (req_dst_orig_node) - batadv_orig_node_free_ref(req_dst_orig_node); + batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } @@ -2725,7 +2790,7 @@ out: * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2828,9 +2893,9 @@ unlock: out: spin_unlock_bh(&bat_priv->tt.commit_lock); if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; @@ -2843,7 +2908,7 @@ out: * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient * - * Returns true if tt request reply was sent, false otherwise. + * Return: true if tt request reply was sent, false otherwise. */ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, @@ -2916,7 +2981,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, @@ -2938,7 +3003,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv, * @addr: the mac address of the client to check * @vid: VLAN identifier * - * Returns true if the client is served by this node, false otherwise. + * Return: true if the client is served by this node, false otherwise. */ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -2958,7 +3023,7 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, ret = true; out: if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3022,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, spin_unlock_bh(&bat_priv->tt.req_list_lock); out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) @@ -3055,11 +3120,16 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.roam_list_lock); } -/* This function checks whether the client already reached the +/** + * batadv_tt_check_roam_count - check if a client has roamed too frequently + * @bat_priv: the bat priv with all the soft interface information + * @client: mac address of the roaming client + * + * This function checks whether the client already reached the * maximum number of possible roaming phases. In this case the ROAMING_ADV * will not be sent. * - * returns true if the ROAMING_ADV can be sent, false otherwise + * Return: true if the ROAMING_ADV can be sent, false otherwise */ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { @@ -3148,7 +3218,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, out: if (primary_if) - batadv_hardif_free_ref(primary_if); + batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) @@ -3272,11 +3342,11 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) /* decrease the reference held for this vlan */ vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); if (vlan) { - batadv_softif_vlan_free_ref(vlan); - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); + batadv_softif_vlan_put(vlan); } - batadv_tt_local_entry_free_ref(tt_local); + batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); } @@ -3359,11 +3429,11 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, ret = true; out: - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_put(vlan); if (tt_global_entry) - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); if (tt_local_entry) - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3371,13 +3441,12 @@ out: * batadv_tt_update_orig - update global translation table with new tt * information received via ogms * @bat_priv: the bat priv with all the soft interface information - * @orig: the orig_node of the ogm - * @tt_vlan: pointer to the first tvlv VLAN entry + * @orig_node: the orig_node of the ogm + * @tt_buff: pointer to the first tvlv VLAN entry * @tt_num_vlan: number of tvlv VLAN entries * @tt_change: pointer to the first entry in the TT buffer * @tt_num_changes: number of tt changes inside the tt buffer * @ttvn: translation table version number of this changeset - * @tt_crc: crc32 checksum of orig node's translation table */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -3459,7 +3528,7 @@ request_table: * @addr: the mac address of the client to check * @vid: VLAN identifier * - * Returns true if we know that the client has moved from its old originator + * Return: true if we know that the client has moved from its old originator * to another one. This entry is still kept for consistency purposes and will be * deleted later by a DEL or because of timeout */ @@ -3474,7 +3543,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, goto out; ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM; - batadv_tt_global_entry_free_ref(tt_global_entry); + batadv_tt_global_entry_put(tt_global_entry); out: return ret; } @@ -3485,7 +3554,7 @@ out: * @addr: the mac address of the local client to query * @vid: VLAN identifier * - * Returns true if the local client is known to be roaming (it is not served by + * Return: true if the local client is known to be roaming (it is not served by * this node anymore) or not. If yes, the client is still present in the table * to keep the latter consistent with the node TTVN */ @@ -3500,7 +3569,7 @@ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, goto out; ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM; - batadv_tt_local_entry_free_ref(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); out: return ret; } @@ -3614,7 +3683,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * - * Returns NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS + * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, @@ -3695,7 +3764,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, * @tvlv_value: tvlv buffer containing the tt data * @tvlv_value_len: tvlv buffer length * - * Returns NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS + * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS * otherwise. */ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, @@ -3733,7 +3802,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, out: if (orig_node) - batadv_orig_node_free_ref(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -3741,7 +3810,7 @@ out: * batadv_tt_init - initialise the translation table internals * @bat_priv: the bat priv with all the soft interface information * - * Return 0 on success or negative error number in case of failure. + * Return: 0 on success or negative error number in case of failure. */ int batadv_tt_init(struct batadv_priv *bat_priv) { @@ -3779,7 +3848,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv) * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected * - * Returns true if the client is marked with the TT_CLIENT_ISOLA flag, false + * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, @@ -3794,7 +3863,7 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA; - batadv_tt_global_entry_free_ref(tt); + batadv_tt_global_entry_put(tt); return ret; } diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index abd8e116e5fb..7c7e2c006bfe 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 3437b667a2cd..612de23178e6 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -25,6 +25,7 @@ #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/if_ether.h> +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/sched.h> /* for linux/wait.h */ #include <linux/spinlock.h> @@ -73,7 +74,7 @@ enum batadv_dhcp_recipient { #define BATADV_TT_SYNC_MASK 0x00F0 /** - * struct batadv_hard_iface_bat_iv - per hard interface B.A.T.M.A.N. IV data + * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data * @ogm_buff: buffer holding the OGM packet * @ogm_buff_len: length of the OGM packet buffer * @ogm_seqno: OGM sequence number - used to identify each OGM @@ -97,8 +98,8 @@ struct batadv_hard_iface_bat_iv { * batman-adv for this interface * @soft_iface: the batman-adv interface which uses this network interface * @rcu: struct used for freeing in an RCU-safe manner - * @bat_iv: BATMAN IV specific per hard interface data - * @cleanup_work: work queue callback item for hard interface deinit + * @bat_iv: per hard-interface B.A.T.M.A.N. IV data + * @cleanup_work: work queue callback item for hard-interface deinit * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs * @neigh_list: list of unique single hop neighbors via this interface * @neigh_list_lock: lock protecting neigh_list @@ -110,7 +111,7 @@ struct batadv_hard_iface { struct net_device *net_dev; u8 num_bcasts; struct kobject *hardif_obj; - atomic_t refcount; + struct kref refcount; struct packet_type batman_adv_ptype; struct net_device *soft_iface; struct rcu_head rcu; @@ -125,7 +126,7 @@ struct batadv_hard_iface { /** * struct batadv_orig_ifinfo - originator info per outgoing interface * @list: list node for orig_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard interface + * @if_outgoing: pointer to outgoing hard-interface * @router: router that should be used to reach this originator * @last_real_seqno: last and best known sequence number * @last_ttl: ttl of last received packet @@ -140,7 +141,7 @@ struct batadv_orig_ifinfo { u32 last_real_seqno; u8 last_ttl; unsigned long batman_seqno_reset; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -196,13 +197,13 @@ struct batadv_orig_node_vlan { unsigned short vid; struct batadv_vlan_tt tt; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; /** * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members - * @bcast_own: set of bitfields (one per hard interface) where each one counts + * @bcast_own: set of bitfields (one per hard-interface) where each one counts * the number of our OGMs this orig_node rebroadcasted "back" to us (relative * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long. * @bcast_own_sum: sum of bcast_own @@ -298,7 +299,7 @@ struct batadv_orig_node { struct batadv_priv *bat_priv; /* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */ spinlock_t bcast_seqno_lock; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; #ifdef CONFIG_BATMAN_ADV_NC struct list_head in_coding_list; @@ -341,15 +342,16 @@ struct batadv_gw_node { struct batadv_orig_node *orig_node; u32 bandwidth_down; u32 bandwidth_up; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; /** - * batadv_hardif_neigh_node - unique neighbor per hard interface + * struct batadv_hardif_neigh_node - unique neighbor per hard-interface * @list: list node for batadv_hard_iface::neigh_list * @addr: the MAC address of the neighboring interface - * @if_incoming: pointer to incoming hard interface + * @if_incoming: pointer to incoming hard-interface + * @last_seen: when last packet via this neighbor was received * @refcount: number of contexts the object is used * @rcu: struct used for freeing in a RCU-safe manner */ @@ -358,7 +360,7 @@ struct batadv_hardif_neigh_node { u8 addr[ETH_ALEN]; struct batadv_hard_iface *if_incoming; unsigned long last_seen; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -369,7 +371,7 @@ struct batadv_hardif_neigh_node { * @addr: the MAC address of the neighboring interface * @ifinfo_list: list for routing metrics per outgoing interface * @ifinfo_lock: lock protecting private ifinfo members and list - * @if_incoming: pointer to incoming hard interface + * @if_incoming: pointer to incoming hard-interface * @last_seen: when last packet via this neighbor was received * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -382,13 +384,13 @@ struct batadv_neigh_node { spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ struct batadv_hard_iface *if_incoming; unsigned long last_seen; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; /** * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing - * interface for BATMAN IV + * interface for B.A.T.M.A.N. IV * @tq_recv: ring buffer of received TQ values from this neigh node * @tq_index: ring buffer index * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv) @@ -407,7 +409,7 @@ struct batadv_neigh_ifinfo_bat_iv { /** * struct batadv_neigh_ifinfo - neighbor information per outgoing interface * @list: list node for batadv_neigh_node::ifinfo_list - * @if_outgoing: pointer to outgoing hard interface + * @if_outgoing: pointer to outgoing hard-interface * @bat_iv: B.A.T.M.A.N. IV private structure * @last_ttl: last received ttl from this neigh node * @refcount: number of contexts the object is used @@ -418,7 +420,7 @@ struct batadv_neigh_ifinfo { struct batadv_hard_iface *if_outgoing; struct batadv_neigh_ifinfo_bat_iv bat_iv; u8 last_ttl; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -744,7 +746,7 @@ struct batadv_softif_vlan { atomic_t ap_isolation; /* boolean */ struct batadv_vlan_tt tt; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -771,6 +773,9 @@ struct batadv_softif_vlan { * @orig_interval: OGM broadcast interval in milliseconds * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop * @log_level: configured log level (see batadv_dbg_level) + * @isolation_mark: the skb->mark value used to match packets for AP isolation + * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be used + * for the isolation mark * @bcast_seqno: last sent broadcast packet sequence number * @bcast_queue_left: number of remaining buffered broadcast packet slots * @batman_queue_left: number of remaining OGM packet slots @@ -783,8 +788,8 @@ struct batadv_softif_vlan { * @forw_bat_list_lock: lock protecting forw_bat_list * @forw_bcast_list_lock: lock protecting forw_bcast_list * @orig_work: work queue callback item for orig node purging - * @cleanup_work: work queue callback item for soft interface deinit - * @primary_if: one of the hard interfaces assigned to this mesh interface + * @cleanup_work: work queue callback item for soft-interface deinit + * @primary_if: one of the hard-interfaces assigned to this mesh interface * becomes the primary interface * @bat_algo_ops: routing algorithm used by this mesh interface * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top @@ -925,7 +930,7 @@ struct batadv_bla_backbone_gw { atomic_t request_sent; u16 crc; spinlock_t crc_lock; /* protects crc */ - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -946,7 +951,7 @@ struct batadv_bla_claim { unsigned long lasttime; struct hlist_node hash_entry; struct rcu_head rcu; - atomic_t refcount; + struct kref refcount; }; #endif @@ -967,7 +972,7 @@ struct batadv_tt_common_entry { struct hlist_node hash_entry; u16 flags; unsigned long added_at; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1009,7 +1014,7 @@ struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; u8 ttvn; struct hlist_node list; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1062,7 +1067,7 @@ struct batadv_tt_roam_node { struct batadv_nc_node { struct list_head list; u8 addr[ETH_ALEN]; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; struct batadv_orig_node *orig_node; unsigned long last_seen; @@ -1082,7 +1087,7 @@ struct batadv_nc_node { struct batadv_nc_path { struct hlist_node hash_entry; struct rcu_head rcu; - atomic_t refcount; + struct kref refcount; struct list_head packet_list; spinlock_t packet_list_lock; /* Protects packet_list */ u8 next_hop[ETH_ALEN]; @@ -1225,7 +1230,7 @@ struct batadv_dat_entry { unsigned short vid; unsigned long last_update; struct hlist_node hash_entry; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; @@ -1261,7 +1266,7 @@ struct batadv_dat_candidate { struct batadv_tvlv_container { struct hlist_node list; struct batadv_tvlv_hdr tvlv_hdr; - atomic_t refcount; + struct kref refcount; }; /** @@ -1288,7 +1293,7 @@ struct batadv_tvlv_handler { u8 type; u8 version; u8 flags; - atomic_t refcount; + struct kref refcount; struct rcu_head rcu; }; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index d040365ba98e..8a4cc2f7f0db 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -307,6 +307,9 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, /* check that it's our buffer */ if (lowpan_is_ipv6(*skb_network_header(skb))) { + /* Pull off the 1-byte of 6lowpan header. */ + skb_pull(skb, 1); + /* Copy the packet so that the IPv6 header is * properly aligned. */ @@ -317,6 +320,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; + local_skb->dev = dev; skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); @@ -335,6 +339,8 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, if (!local_skb) goto drop; + local_skb->dev = dev; + ret = iphc_decompress(local_skb, dev, chan); if (ret < 0) { kfree_skb(local_skb); @@ -343,7 +349,6 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; - local_skb->dev = dev; if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 47bcef754796..883c821a9e78 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4112,8 +4112,10 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, break; } - *req_complete = bt_cb(skb)->hci.req_complete; - *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) + *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + else + *req_complete = bt_cb(skb)->hci.req_complete; kfree_skb(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 41b5f3813f02..c78ee2dc9323 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -688,21 +688,29 @@ static u8 update_white_list(struct hci_request *req) * command to remove it from the controller. */ list_for_each_entry(b, &hdev->le_white_list, list) { - struct hci_cp_le_del_from_white_list cp; + /* If the device is neither in pend_le_conns nor + * pend_le_reports then remove it from the whitelist. + */ + if (!hci_pend_le_action_lookup(&hdev->pend_le_conns, + &b->bdaddr, b->bdaddr_type) && + !hci_pend_le_action_lookup(&hdev->pend_le_reports, + &b->bdaddr, b->bdaddr_type)) { + struct hci_cp_le_del_from_white_list cp; + + cp.bdaddr_type = b->bdaddr_type; + bacpy(&cp.bdaddr, &b->bdaddr); - if (hci_pend_le_action_lookup(&hdev->pend_le_conns, - &b->bdaddr, b->bdaddr_type) || - hci_pend_le_action_lookup(&hdev->pend_le_reports, - &b->bdaddr, b->bdaddr_type)) { - white_list_entries++; + hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, + sizeof(cp), &cp); continue; } - cp.bdaddr_type = b->bdaddr_type; - bacpy(&cp.bdaddr, &b->bdaddr); + if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { + /* White list can not be used with RPAs */ + return 0x00; + } - hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, - sizeof(cp), &cp); + white_list_entries++; } /* Since all no longer valid white list entries have been diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 39a5149f3010..eb4f5f24cbe3 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -197,10 +197,20 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm) chan->sport = psm; err = 0; } else { - u16 p; + u16 p, start, end, incr; + + if (chan->src_type == BDADDR_BREDR) { + start = L2CAP_PSM_DYN_START; + end = L2CAP_PSM_AUTO_END; + incr = 2; + } else { + start = L2CAP_PSM_LE_DYN_START; + end = L2CAP_PSM_LE_DYN_END; + incr = 1; + } err = -EINVAL; - for (p = 0x1001; p < 0x1100; p += 2) + for (p = start; p <= end; p += incr) if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) { chan->psm = cpu_to_le16(p); chan->sport = cpu_to_le16(p); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 1bb551527044..e4cae72895a7 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -58,7 +58,7 @@ static int l2cap_validate_bredr_psm(u16 psm) return -EINVAL; /* Restrict usage of well-known PSMs */ - if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) + if (psm < L2CAP_PSM_DYN_START && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; return 0; @@ -67,11 +67,11 @@ static int l2cap_validate_bredr_psm(u16 psm) static int l2cap_validate_le_psm(u16 psm) { /* Valid LE_PSM ranges are defined only until 0x00ff */ - if (psm > 0x00ff) + if (psm > L2CAP_PSM_LE_DYN_END) return -EINVAL; /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */ - if (psm <= 0x007f && !capable(CAP_NET_BIND_SERVICE)) + if (psm < L2CAP_PSM_LE_DYN_START && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; return 0; @@ -125,6 +125,9 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) goto done; } + bacpy(&chan->src, &la.l2_bdaddr); + chan->src_type = la.l2_bdaddr_type; + if (la.l2_cid) err = l2cap_add_scid(chan, __le16_to_cpu(la.l2_cid)); else @@ -156,9 +159,6 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) break; } - bacpy(&chan->src, &la.l2_bdaddr); - chan->src_type = la.l2_bdaddr_type; - if (chan->psm && bdaddr_type_is_le(chan->src_type)) chan->mode = L2CAP_MODE_LE_FLOWCTL; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ffed8a1d4f27..4b175df35184 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1072,22 +1072,6 @@ static void smp_notify_keys(struct l2cap_conn *conn) hcon->dst_type = smp->remote_irk->addr_type; queue_work(hdev->workqueue, &conn->id_addr_update_work); } - - /* When receiving an indentity resolving key for - * a remote device that does not use a resolvable - * private address, just remove the key so that - * it is possible to use the controller white - * list for scanning. - * - * Userspace will have been told to not store - * this key at this point. So it is safe to - * just remove it. - */ - if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) { - list_del_rcu(&smp->remote_irk->list); - kfree_rcu(smp->remote_irk, rcu); - smp->remote_irk = NULL; - } } if (smp->csrk) { diff --git a/net/bridge/br.c b/net/bridge/br.c index a1abe4936fe1..3addc05b9a16 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -121,6 +121,7 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; +/* called with RTNL */ static int br_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -130,7 +131,6 @@ static int br_switchdev_event(struct notifier_block *unused, struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; - rtnl_lock(); p = br_port_get_rtnl(dev); if (!p) goto out; @@ -155,7 +155,6 @@ static int br_switchdev_event(struct notifier_block *unused, } out: - rtnl_unlock(); return err; } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 5e88d3e17546..2c8095a5d824 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -28,6 +28,8 @@ const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); +static struct lock_class_key bridge_netdev_addr_lock_key; + /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -87,6 +89,11 @@ out: return NETDEV_TX_OK; } +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -99,6 +106,7 @@ static int br_dev_init(struct net_device *dev) err = br_vlan_init(br); if (err) free_percpu(br->stats); + br_set_lockdep_class(dev); return err; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index c367b3e1b5ac..a73df3315df9 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -36,10 +36,10 @@ */ static int port_cost(struct net_device *dev) { - struct ethtool_cmd ecmd; + struct ethtool_link_ksettings ecmd; - if (!__ethtool_get_settings(dev, &ecmd)) { - switch (ethtool_cmd_speed(&ecmd)) { + if (!__ethtool_get_link_ksettings(dev, &ecmd)) { + switch (ecmd.base.speed) { case SPEED_10000: return 2; case SPEED_1000: @@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head) destroy_nbp(p); } +static unsigned get_max_headroom(struct net_bridge *br) +{ + unsigned max_headroom = 0; + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); + + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + + return max_headroom; +} + +static void update_headroom(struct net_bridge *br, int new_hr) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) + netdev_set_rx_headroom(p->dev, new_hr); + + br->dev->needed_headroom = new_hr; +} + /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. @@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p) br_ifinfo_notify(RTM_DELLINK, p); list_del_rcu(&p->list); + if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) + update_headroom(br, get_max_headroom(br)); + netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); @@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; + unsigned br_hr, dev_hr; bool changed_addr; /* Don't allow bridging non-ethernet like devices, or DSA-enabled @@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) netdev_update_features(br->dev); - if (br->dev->needed_headroom < dev->needed_headroom) - br->dev->needed_headroom = dev->needed_headroom; + br_hr = br->dev->needed_headroom; + dev_hr = netdev_get_fwd_headroom(dev); + if (br_hr < dev_hr) + update_headroom(br, dev_hr); + else + netdev_set_rx_headroom(dev, br_hr); if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 30e105f57f0d..253bc77eda3b 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -20,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; - struct nlattr *nest; + struct nlattr *nest, *port_nest; if (!br->multicast_router || hlist_empty(&br->router_list)) return 0; @@ -30,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, return -EMSGSIZE; hlist_for_each_entry_rcu(p, &br->router_list, rlist) { - if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex)) + if (!p) + continue; + port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT); + if (!port_nest) + goto fail; + if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || + nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, + br_timer_value(&p->multicast_router_timer)) || + nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, + p->multicast_router)) { + nla_nest_cancel(skb, port_nest); goto fail; + } + nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); @@ -41,6 +53,14 @@ fail: return -EMSGSIZE; } +static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) +{ + e->state = flags & MDB_PG_FLAGS_PERMANENT; + e->flags = 0; + if (flags & MDB_PG_FLAGS_OFFLOAD) + e->flags |= MDB_FLAGS_OFFLOAD; +} + static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { @@ -80,26 +100,41 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { + struct nlattr *nest_ent; + struct br_mdb_entry e; + port = p->port; - if (port) { - struct br_mdb_entry e; - memset(&e, 0, sizeof(e)); - e.ifindex = port->dev->ifindex; - e.state = p->state; - e.vid = p->addr.vid; - if (p->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = p->addr.u.ip4; + if (!port) + continue; + + memset(&e, 0, sizeof(e)); + e.ifindex = port->dev->ifindex; + e.vid = p->addr.vid; + __mdb_entry_fill_flags(&e, p->flags); + if (p->addr.proto == htons(ETH_P_IP)) + e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - if (p->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = p->addr.u.ip6; + if (p->addr.proto == htons(ETH_P_IPV6)) + e.addr.u.ip6 = p->addr.u.ip6; #endif - e.addr.proto = p->addr.proto; - if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) { - nla_nest_cancel(skb, nest2); - err = -EMSGSIZE; - goto out; - } + e.addr.proto = p->addr.proto; + nest_ent = nla_nest_start(skb, + MDBA_MDB_ENTRY_INFO); + if (!nest_ent) { + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; + } + if (nla_put_nohdr(skb, sizeof(e), &e) || + nla_put_u32(skb, + MDBA_MDB_EATTR_TIMER, + br_timer_value(&p->timer))) { + nla_nest_cancel(skb, nest_ent); + nla_nest_cancel(skb, nest2); + err = -EMSGSIZE; + goto out; } + nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest2); skip: @@ -209,7 +244,7 @@ static inline size_t rtnl_mdb_nlmsg_size(void) } static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, - int type) + int type, struct net_bridge_port_group *pg) { struct switchdev_obj_port_mdb mdb = { .obj = { @@ -232,10 +267,13 @@ static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, #endif mdb.obj.orig_dev = port_dev; - if (port_dev && type == RTM_NEWMDB) - switchdev_port_obj_add(port_dev, &mdb.obj); - else if (port_dev && type == RTM_DELMDB) + if (port_dev && type == RTM_NEWMDB) { + err = switchdev_port_obj_add(port_dev, &mdb.obj); + if (!err && pg) + pg->flags |= MDB_PG_FLAGS_OFFLOAD; + } else if (port_dev && type == RTM_DELMDB) { switchdev_port_obj_del(port_dev, &mdb.obj); + } skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC); if (!skb) @@ -253,21 +291,21 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } -void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 state) +void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, + int type) { struct br_mdb_entry entry; memset(&entry, 0, sizeof(entry)); - entry.ifindex = port->dev->ifindex; - entry.addr.proto = group->proto; - entry.addr.u.ip4 = group->u.ip4; + entry.ifindex = pg->port->dev->ifindex; + entry.addr.proto = pg->addr.proto; + entry.addr.u.ip4 = pg->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - entry.addr.u.ip6 = group->u.ip6; + entry.addr.u.ip6 = pg->addr.u.ip6; #endif - entry.state = state; - entry.vid = group->vid; - __br_mdb_notify(dev, &entry, type); + entry.vid = pg->addr.vid; + __mdb_entry_fill_flags(&entry, pg->flags); + __br_mdb_notify(dev, &entry, type, pg); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, @@ -412,7 +450,8 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, unsigned char state) + struct br_ip *group, unsigned char state, + struct net_bridge_port_group **pg) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -425,8 +464,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, mp = br_mdb_ip_get(mdb, group); if (!mp) { mp = br_multicast_new_group(br, port, group); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + err = PTR_ERR_OR_ZERO(mp); + if (err) return err; } @@ -443,6 +482,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); + *pg = p; if (state == MDB_TEMPORARY) mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -450,7 +490,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } static int __br_mdb_add(struct net *net, struct net_bridge *br, - struct br_mdb_entry *entry) + struct br_mdb_entry *entry, + struct net_bridge_port_group **pg) { struct br_ip ip; struct net_device *dev; @@ -479,7 +520,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, #endif spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip, entry->state); + ret = br_mdb_add_group(br, p, &ip, entry->state, pg); spin_unlock_bh(&br->multicast_lock); return ret; } @@ -487,6 +528,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); + struct net_bridge_port_group *pg; struct net_bridge_vlan_group *vg; struct net_device *dev, *pdev; struct br_mdb_entry *entry; @@ -516,15 +558,15 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) if (br_vlan_enabled(br) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_add(net, br, entry); + err = __br_mdb_add(net, br, entry, &pg); if (err) break; - __br_mdb_notify(dev, entry, RTM_NEWMDB); + __br_mdb_notify(dev, entry, RTM_NEWMDB, pg); } } else { - err = __br_mdb_add(net, br, entry); + err = __br_mdb_add(net, br, entry, &pg); if (!err) - __br_mdb_notify(dev, entry, RTM_NEWMDB); + __br_mdb_notify(dev, entry, RTM_NEWMDB, pg); } return err; @@ -568,7 +610,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (p->port->state == BR_STATE_DISABLED) goto unlock; - entry->state = p->state; + __mdb_entry_fill_flags(entry, p->flags); rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); @@ -620,12 +662,12 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) entry->vid = v->vid; err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + __br_mdb_notify(dev, entry, RTM_DELMDB, NULL); } } else { err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + __br_mdb_notify(dev, entry, RTM_DELMDB, NULL); } return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 03661d97463c..a4c15df2b792 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -283,8 +283,7 @@ static void br_multicast_del_pg(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, - p->state); + br_mdb_notify(br->dev, p, RTM_DELMDB); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->mglist && @@ -304,7 +303,7 @@ static void br_multicast_port_group_expired(unsigned long data) spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || - hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT) + hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT) goto out; br_multicast_del_pg(br, pg); @@ -649,7 +648,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char state) + unsigned char flags) { struct net_bridge_port_group *p; @@ -659,7 +658,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->addr = *group; p->port = port; - p->state = state; + p->flags = flags; rcu_assign_pointer(p->next, next); hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, @@ -702,11 +701,11 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY); + p = br_multicast_new_port_group(port, group, *pp, 0); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY); + br_mdb_notify(br->dev, p, RTM_NEWMDB); found: mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -760,13 +759,17 @@ static void br_multicast_router_expired(unsigned long data) struct net_bridge *br = port->br; spin_lock(&br->multicast_lock); - if (port->multicast_router != 1 || + if (port->multicast_router == MDB_RTR_TYPE_DISABLED || + port->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(&port->multicast_router_timer) || hlist_unhashed(&port->rlist)) goto out; hlist_del_init_rcu(&port->rlist); br_rtr_notify(br->dev, port, RTM_DELMDB); + /* Don't allow timer refresh if the router expired */ + if (port->multicast_router == MDB_RTR_TYPE_TEMP) + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; out: spin_unlock(&br->multicast_lock); @@ -913,7 +916,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data) void br_multicast_add_port(struct net_bridge_port *port) { - port->multicast_router = 1; + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; setup_timer(&port->multicast_router_timer, br_multicast_router_expired, (unsigned long)port); @@ -960,7 +963,8 @@ void br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif - if (port->multicast_router == 2 && hlist_unhashed(&port->rlist)) + if (port->multicast_router == MDB_RTR_TYPE_PERM && + hlist_unhashed(&port->rlist)) br_multicast_add_router(br, port); out: @@ -975,12 +979,15 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - if (pg->state == MDB_TEMPORARY) + if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_del_pg(br, pg); if (!hlist_unhashed(&port->rlist)) { hlist_del_init_rcu(&port->rlist); br_rtr_notify(br->dev, port, RTM_DELMDB); + /* Don't allow timer refresh if disabling */ + if (port->multicast_router == MDB_RTR_TYPE_TEMP) + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } del_timer(&port->multicast_router_timer); del_timer(&port->ip4_own_query.timer); @@ -1228,13 +1235,14 @@ static void br_multicast_mark_router(struct net_bridge *br, unsigned long now = jiffies; if (!port) { - if (br->multicast_router == 1) + if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) mod_timer(&br->multicast_router_timer, now + br->multicast_querier_interval); return; } - if (port->multicast_router != 1) + if (port->multicast_router == MDB_RTR_TYPE_DISABLED || + port->multicast_router == MDB_RTR_TYPE_PERM) return; br_multicast_add_router(br, port); @@ -1453,8 +1461,7 @@ br_multicast_leave_group(struct net_bridge *br, hlist_del_init(&p->mglist); del_timer(&p->timer); call_rcu_bh(&p->rcu, br_multicast_free_pg); - br_mdb_notify(br->dev, port, group, RTM_DELMDB, - p->state); + br_mdb_notify(br->dev, p, RTM_DELMDB); if (!mp->ports && !mp->mglist && netif_running(br->dev)) @@ -1715,7 +1722,7 @@ void br_multicast_init(struct net_bridge *br) br->hash_elasticity = 4; br->hash_max = 512; - br->multicast_router = 1; + br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_querier = 0; br->multicast_query_use_ifaddr = 0; br->multicast_last_member_count = 2; @@ -1825,11 +1832,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) spin_lock_bh(&br->multicast_lock); switch (val) { - case 0: - case 2: + case MDB_RTR_TYPE_DISABLED: + case MDB_RTR_TYPE_PERM: del_timer(&br->multicast_router_timer); /* fall through */ - case 1: + case MDB_RTR_TYPE_TEMP_QUERY: br->multicast_router = val; err = 0; break; @@ -1840,37 +1847,53 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) return err; } +static void __del_port_router(struct net_bridge_port *p) +{ + if (hlist_unhashed(&p->rlist)) + return; + hlist_del_init_rcu(&p->rlist); + br_rtr_notify(p->br->dev, p, RTM_DELMDB); +} + int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) { struct net_bridge *br = p->br; + unsigned long now = jiffies; int err = -EINVAL; spin_lock(&br->multicast_lock); - - switch (val) { - case 0: - case 1: - case 2: - p->multicast_router = val; + if (p->multicast_router == val) { + /* Refresh the temp router port timer */ + if (p->multicast_router == MDB_RTR_TYPE_TEMP) + mod_timer(&p->multicast_router_timer, + now + br->multicast_querier_interval); err = 0; - - if (val < 2 && !hlist_unhashed(&p->rlist)) { - hlist_del_init_rcu(&p->rlist); - br_rtr_notify(br->dev, p, RTM_DELMDB); - } - - if (val == 1) - break; - + goto unlock; + } + switch (val) { + case MDB_RTR_TYPE_DISABLED: + p->multicast_router = MDB_RTR_TYPE_DISABLED; + __del_port_router(p); + del_timer(&p->multicast_router_timer); + break; + case MDB_RTR_TYPE_TEMP_QUERY: + p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + __del_port_router(p); + break; + case MDB_RTR_TYPE_PERM: + p->multicast_router = MDB_RTR_TYPE_PERM; del_timer(&p->multicast_router_timer); - - if (val == 0) - break; - br_multicast_add_router(br, p); break; + case MDB_RTR_TYPE_TEMP: + p->multicast_router = MDB_RTR_TYPE_TEMP; + br_multicast_mark_router(br, p); + break; + default: + goto unlock; } - + err = 0; +unlock: spin_unlock(&br->multicast_lock); return err; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 40197ff8918a..e9c635eae24d 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -598,7 +598,6 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) return -ENETDOWN; br_set_state(p, state); - br_log_state(p); br_port_state_selection(p->br); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 216018c76018..1b5d145dfcbf 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -150,6 +150,9 @@ struct net_bridge_fdb_entry struct rcu_head rcu; }; +#define MDB_PG_FLAGS_PERMANENT BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) + struct net_bridge_port_group { struct net_bridge_port *port; struct net_bridge_port_group __rcu *next; @@ -157,7 +160,7 @@ struct net_bridge_port_group { struct rcu_head rcu; struct timer_list timer; struct br_ip addr; - unsigned char state; + unsigned char flags; }; struct net_bridge_mdb_entry @@ -554,11 +557,11 @@ void br_multicast_free_pg(struct rcu_head *head); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char state); + unsigned char flags); void br_mdb_init(void); void br_mdb_uninit(void); -void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type, u8 state); +void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, + int type); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); @@ -897,7 +900,6 @@ static inline void br_nf_core_fini(void) {} #endif /* br_stp.c */ -void br_log_state(const struct net_bridge_port *p); void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index b3cca126b103..c22816a0b1b1 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -30,13 +30,6 @@ static const char *const br_port_state_names[] = { [BR_STATE_BLOCKING] = "blocking", }; -void br_log_state(const struct net_bridge_port *p) -{ - br_info(p->br, "port %u(%s) entered %s state\n", - (unsigned int) p->port_no, p->dev->name, - br_port_state_names[p->state]); -} - void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { @@ -52,6 +45,10 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); + else + br_info(p->br, "port %u(%s) entered %s state\n", + (unsigned int) p->port_no, p->dev->name, + br_port_state_names[p->state]); } /* called under bridge lock */ @@ -126,7 +123,6 @@ static void br_root_port_block(const struct net_bridge *br, (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay > 0) @@ -407,7 +403,6 @@ static void br_make_blocking(struct net_bridge_port *p) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->forward_delay_timer); @@ -431,7 +426,6 @@ static void br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); if (br->forward_delay != 0) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index a31ac6ad76a2..984d46263007 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -102,7 +102,6 @@ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); } @@ -118,7 +117,6 @@ void br_stp_disable_port(struct net_bridge_port *p) p->topology_change_ack = 0; p->config_pending = 0; - br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->message_age_timer); diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 5f0f5af0ec35..da058b85aa22 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -98,7 +98,6 @@ static void br_forward_delay_timer_expired(unsigned long arg) br_topology_change_detection(br); netif_carrier_on(br->dev); } - br_log_state(p); rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, p); rcu_read_unlock(); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 85e43af4af7a..9309bb4f2a5b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -955,6 +955,13 @@ err_rhtbl: */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { + struct switchdev_obj_port_vlan v = { + .obj.orig_dev = port->dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; struct net_bridge_vlan *vlan; int ret; @@ -962,6 +969,10 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { + /* Pass the flags to the hardware bridge */ + ret = switchdev_port_obj_add(port->dev, &v.obj); + if (ret && ret != -EOPNOTSUPP) + return ret; __vlan_add_flags(vlan, flags); return 0; } diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index fdba3d9fbff3..adc8d7221dbb 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -48,6 +48,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _oth; + struct net *net = sock_net(oldskb->sk); if (!nft_bridge_iphdr_validate(oldskb)) return; @@ -63,9 +64,9 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - niph->ttl = sysctl_ip_default_ttl; + niph->ttl = net->ipv4.sysctl_ip_default_ttl; niph->tot_len = htons(nskb->len); ip_send_check(niph); @@ -85,6 +86,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, void *payload; __wsum csum; u8 proto; + struct net *net = sock_net(oldskb->sk); if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb)) return; @@ -119,7 +121,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); skb_reset_transport_header(nskb); icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index 61d7617d9249..b82440e1fcb4 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -159,7 +159,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt) tmppkt = NULL; /* Verify that length is correct */ - err = EPROTO; + err = -EPROTO; if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1) goto out; } diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 10d87753ed87..9e43a315e662 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac, void *ticket_buf = NULL; void *tp, *tpend; void **ptp; - struct ceph_timespec new_validity; struct ceph_crypto_key new_session_key; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; @@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); + ceph_decode_timespec(&validity, dp); + dp += sizeof(struct ceph_timespec); new_expires = get_seconds() + validity.tv_sec; new_renew_after = new_expires - (validity.tv_sec / 4); dout(" expires=%lu renew_after=%lu\n", new_expires, @@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_buffer_put(th->ticket_blob); th->session_key = new_session_key; th->ticket_blob = new_ticket_blob; - th->validity = new_validity; th->secret_id = new_secret_id; th->expires = new_expires; th->renew_after = new_renew_after; + th->have_key = true; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); @@ -384,6 +383,24 @@ bad: return -ERANGE; } +static bool need_key(struct ceph_x_ticket_handler *th) +{ + if (!th->have_key) + return true; + + return get_seconds() >= th->renew_after; +} + +static bool have_key(struct ceph_x_ticket_handler *th) +{ + if (th->have_key) { + if (get_seconds() >= th->expires) + th->have_key = false; + } + + return th->have_key; +} + static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) { int want = ac->want_keys; @@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) continue; th = get_ticket_handler(ac, service); - if (IS_ERR(th)) { *pneed |= service; continue; } - if (get_seconds() >= th->renew_after) + if (need_key(th)) *pneed |= service; - if (get_seconds() >= th->expires) + if (!have_key(th)) xi->have_keys &= ~service; } } - static int ceph_x_build_request(struct ceph_auth_client *ac, void *buf, void *end) { @@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) ac->private = NULL; } -static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, - int peer_type) +static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) { struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - memset(&th->validity, 0, sizeof(th->validity)); + th->have_key = false; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + /* + * We are to invalidate a service ticket in the hopes of + * getting a new, hopefully more valid, one. But, we won't get + * it unless our AUTH ticket is good, so invalidate AUTH ticket + * as well, just in case. + */ + invalidate_ticket(ac, peer_type); + invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH); } static int calcu_signature(struct ceph_x_authorizer *au, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index e8b7c6917d47..40b1a3cf7397 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -16,7 +16,7 @@ struct ceph_x_ticket_handler { unsigned int service; struct ceph_crypto_key session_key; - struct ceph_timespec validity; + bool have_key; u64 secret_id; struct ceph_buffer *ticket_blob; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 393bfb22d5bb..5fcfb98f309e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @stable: stable mode starts rep=0 in the recursive call for all replicas * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent @@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_fallback_retries, int recurse_to_leaf, unsigned int vary_r, + unsigned int stable, int *out2, int parent_r) { @@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, int collide, reject; int count = out_size; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep, tries, recurse_tries, local_retries, local_fallback_retries, - parent_r); + parent_r, stable); - for (rep = outpos; rep < numrep && count > 0 ; rep++) { + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, - x, outpos+1, 0, + x, stable ? 1 : outpos+1, 0, out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, 0, vary_r, + stable, NULL, sub_r) <= outpos) /* didn't get leaf */ @@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, int choose_local_fallback_retries = map->choose_local_fallback_tries; int vary_r = map->chooseleaf_vary_r; + int stable = map->chooseleaf_stable; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_TAKE: if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) || - (-1-curstep->arg1 < map->max_buckets && + (-1-curstep->arg1 >= 0 && + -1-curstep->arg1 < map->max_buckets && map->buckets[-1-curstep->arg1])) { w[0] = curstep->arg1; wsize = 1; @@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, vary_r = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + if (curstep->arg1 >= 0) + stable = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map, osize = 0; for (i = 0; i < wsize; i++) { + int bno; /* * see CRUSH_N, CRUSH_N_MINUS macros. * basically, numrep <= 0 means relative to @@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; + /* make sure bucket id is valid */ + bno = -1 - w[i]; + if (bno < 0 || bno >= map->max_buckets) { + /* w[i] is probably CRUSH_ITEM_NONE */ + dprintk(" bad w[i] %d\n", w[i]); + continue; + } if (firstn) { int recurse_tries; if (choose_leaf_tries) @@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, numrep, curstep->arg2, @@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries, recurse_to_leaf, vary_r, + stable, c+osize, 0); } else { @@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, out_size, numrep, curstep->arg2, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9981039ef4ff..9cfedf565f5b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -23,9 +23,6 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> -#define list_entry_next(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con) } con->in_seq = 0; con->in_seq_acked = 0; + + con->out_skip = 0; } /* @@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) static void con_out_kvec_reset(struct ceph_connection *con) { + BUG_ON(con->out_skip); + con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; @@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index; + int index = con->out_kvec_left; - index = con->out_kvec_left; + BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; @@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. + */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + #ifdef CONFIG_BLOCK /* @@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); - cursor->page = list_entry_next(cursor->page, lru); + cursor->page = list_next_entry(cursor->page, lru); cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; @@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_entry_next(cursor->data, links); + cursor->data = list_next_entry(cursor->data, links); __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) @@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_kvec_is_msg = true; con->out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same @@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); - /* fill in crc (except data pages), footer */ + /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = 0; + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); if (m->middle) { @@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; /* is there a data payload? */ con->out_msg->footer.data_crc = 0; @@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con) } } con->out_kvec_left = 0; - con->out_kvec_is_msg = false; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, @@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con) { int ret; + dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); @@ -2506,13 +2528,13 @@ more: more_kvec: /* kvec data queued? */ - if (con->out_skip) { - ret = write_partial_skip(con); + if (con->out_kvec_left) { + ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); + if (con->out_skip) { + ret = write_partial_skip(con); if (ret <= 0) goto out; } @@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con) static void con_fault_finish(struct ceph_connection *con) { + dout("%s %p\n", __func__, con); + /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); + if (con->auth_retry) { + dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->ops->invalidate_authorizer) + con->ops->invalidate_authorizer(con); + con->auth_retry = 0; } if (con->ops->fault) @@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg) ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("%s %p msg %p - was sending\n", __func__, con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; + BUG_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + BUG_ON(!msg->data_length); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) + con->out_skip += sizeof(msg->footer); + else + con->out_skip += sizeof(msg->old_footer); } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s %p msg %p - was sending, will write %d skip %d\n", + __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; - + con->out_msg = NULL; ceph_msg_put(msg); } + mutex_unlock(&con->mutex); } @@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m) static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - LIST_HEAD(data); - struct list_head *links; - struct list_head *next; + struct ceph_msg_data *data, *next; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_splice_init(&m->data, &data); - list_for_each_safe(links, next, &data) { - struct ceph_msg_data *data; - - data = list_entry(links, struct ceph_msg_data, links); - list_del_init(links); + list_for_each_entry_safe(data, next, &m->data, links) { + list_del_init(&data->links); ceph_msg_data_destroy(data); } m->data_length = 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index edda01626a45..de85dddc3dc0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc) return monc->client->have_fsid && monc->auth->global_id > 0; } -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. - */ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f235930d88..3534e12683d3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 osdmap_epoch; int already_completed; u32 bytes; + u8 decode_redir; unsigned int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) p += 8 + 4; /* skip replay_version */ p += 8; /* skip user_version */ + if (le16_to_cpu(msg->hdr.version) >= 7) + ceph_decode_8_safe(&p, end, decode_redir, bad_put); + else + decode_redir = 1; + } else { + decode_redir = 0; + } + + if (decode_redir) { err = ceph_redirect_decode(&p, end, &redir); if (err) goto bad_put; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581d9f1f..243574c8cf33 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p); - dout("crush decode tunable choose_local_tries = %d", + dout("crush decode tunable choose_local_tries = %d\n", c->choose_local_tries); - dout("crush decode tunable choose_local_fallback_tries = %d", + dout("crush decode tunable choose_local_fallback_tries = %d\n", c->choose_local_fallback_tries); - dout("crush decode tunable choose_total_tries = %d", + dout("crush decode tunable choose_total_tries = %d\n", c->choose_total_tries); ceph_decode_need(p, end, sizeof(u32), done); c->chooseleaf_descend_once = ceph_decode_32(p); - dout("crush decode tunable chooseleaf_descend_once = %d", + dout("crush decode tunable chooseleaf_descend_once = %d\n", c->chooseleaf_descend_once); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_vary_r = ceph_decode_8(p); - dout("crush decode tunable chooseleaf_vary_r = %d", + dout("crush decode tunable chooseleaf_vary_r = %d\n", c->chooseleaf_vary_r); + /* skip straw_calc_version, allowed_bucket_algs */ + ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); + *p += sizeof(u8) + sizeof(u32); + + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_stable = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_stable = %d\n", + c->chooseleaf_stable); + done: dout("crush_decode success\n"); return c; diff --git a/net/core/Makefile b/net/core/Makefile index 0b835de04de3..014422e2561f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,3 +24,5 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o +obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/dev.c b/net/core/dev.c index 0ca95d5d7af0..edb7179bc051 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2695,6 +2695,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -2709,6 +2711,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; @@ -3824,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h) trace_consume_skb(skb); else trace_kfree_skb(skb, net_tx_action); - __kfree_skb(skb); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); } + + __kfree_skb_flush(); } if (sd->output_queue) { @@ -4149,7 +4160,10 @@ ncls: ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { drop: - atomic_long_inc(&skb->dev->rx_dropped); + if (!deliver_exact) + atomic_long_inc(&skb->dev->rx_dropped); + else + atomic_long_inc(&skb->dev->rx_nohandler); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -4346,6 +4360,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_metadata_dst_cmp(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); @@ -4543,10 +4558,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) break; case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { + skb_dst_drop(skb); kmem_cache_free(skbuff_head_cache, skb); - else + } else { __kfree_skb(skb); + } break; case GRO_HELD: @@ -5144,6 +5161,7 @@ static void net_rx_action(struct softirq_action *h) } } + __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -5371,12 +5389,12 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; - lower = list_entry((*iter)->next, struct netdev_adjacent, list); + lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; - *iter = &lower->list; + *iter = lower->list.next; return lower->dev; } @@ -7245,24 +7263,31 @@ void netdev_run_todo(void) } } -/* Convert net_device_stats to rtnl_link_stats64. They have the same - * fields in the same order, with only the type differing. +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { #if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); memcpy(stats64, netdev_stats, sizeof(*stats64)); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + sizeof(*netdev_stats), 0, + sizeof(*stats64) - sizeof(*netdev_stats)); #else - size_t i, n = sizeof(*stats64) / sizeof(u64); + size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); const unsigned long *src = (const unsigned long *)netdev_stats; u64 *dst = (u64 *)stats64; - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = src[i]; + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); #endif } EXPORT_SYMBOL(netdev_stats_to_stats64); @@ -7292,6 +7317,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); storage->tx_dropped += atomic_long_read(&dev->tx_dropped); + storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -7414,8 +7440,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); - if (!dev->tx_queue_len) + if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = 1; + } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; diff --git a/net/core/devlink.c b/net/core/devlink.c new file mode 100644 index 000000000000..590fa561cb7f --- /dev/null +++ b/net/core/devlink.c @@ -0,0 +1,738 @@ +/* + * net/core/devlink.c - Network physical/parent device Netlink interface + * + * Heavily inspired by net/wireless/ + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/gfp.h> +#include <linux/device.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <rdma/ib_verbs.h> +#include <net/netlink.h> +#include <net/genetlink.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/devlink.h> + +static LIST_HEAD(devlink_list); + +/* devlink_mutex + * + * An overall lock guarding every operation coming from userspace. + * It also guards devlink devices list and it is taken when + * driver registers/unregisters it. + */ +static DEFINE_MUTEX(devlink_mutex); + +/* devlink_port_mutex + * + * Shared lock to guard lists of ports in all devlink devices. + */ +static DEFINE_MUTEX(devlink_port_mutex); + +static struct net *devlink_net(const struct devlink *devlink) +{ + return read_pnet(&devlink->_net); +} + +static void devlink_net_set(struct devlink *devlink, struct net *net) +{ + write_pnet(&devlink->_net, net); +} + +static struct devlink *devlink_get_from_attrs(struct net *net, + struct nlattr **attrs) +{ + struct devlink *devlink; + char *busname; + char *devname; + + if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME]) + return ERR_PTR(-EINVAL); + + busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); + devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + + list_for_each_entry(devlink, &devlink_list, list) { + if (strcmp(devlink->dev->bus->name, busname) == 0 && + strcmp(dev_name(devlink->dev), devname) == 0 && + net_eq(devlink_net(devlink), net)) + return devlink; + } + + return ERR_PTR(-ENODEV); +} + +static struct devlink *devlink_get_from_info(struct genl_info *info) +{ + return devlink_get_from_attrs(genl_info_net(info), info->attrs); +} + +static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, + int port_index) +{ + struct devlink_port *devlink_port; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (devlink_port->index == port_index) + return devlink_port; + } + return NULL; +} + +static bool devlink_port_index_exists(struct devlink *devlink, int port_index) +{ + return devlink_port_get_by_index(devlink, port_index); +} + +static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) + return ERR_PTR(-ENODEV); + return devlink_port; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_port_get_from_attrs(devlink, info->attrs); +} + +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) + +static int devlink_nl_pre_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink; + + mutex_lock(&devlink_mutex); + devlink = devlink_get_from_info(info); + if (IS_ERR(devlink)) { + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink); + } + info->user_ptr[0] = devlink; + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + struct devlink_port *devlink_port; + + mutex_lock(&devlink_port_mutex); + devlink_port = devlink_port_get_from_info(devlink, info); + if (IS_ERR(devlink_port)) { + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink_port); + } + info->user_ptr[1] = devlink_port; + } + return 0; +} + +static void devlink_nl_post_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); +} + +static struct genl_family devlink_nl_family = { + .id = GENL_ID_GENERATE, + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, +}; + +enum devlink_multicast_groups { + DEVLINK_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group devlink_nl_mcgrps[] = { + [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, +}; + +static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) +{ + if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name)) + return -EMSGSIZE; + if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev))) + return -EMSGSIZE; + return 0; +} + +static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) + goto nla_put_failure; + if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && + nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, + devlink_port->desired_type)) + goto nla_put_failure; + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + struct net_device *netdev = devlink_port->type_dev; + + if (netdev && + (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, + netdev->ifindex) || + nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, + netdev->name))) + goto nla_put_failure; + } + if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { + struct ib_device *ibdev = devlink_port->type_dev; + + if (ibdev && + nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, + ibdev->name)) + goto nla_put_failure; + } + if (devlink_port->split && + nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, + devlink_port->split_group)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_port->devlink; + struct sk_buff *msg; + int err; + + if (!devlink_port->registered) + return; + + WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + goto out; + idx++; + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_PORT_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_port *devlink_port; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_port_fill(msg, devlink, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) + goto out; + idx++; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_port_type_set(struct devlink *devlink, + struct devlink_port *devlink_port, + enum devlink_port_type port_type) + +{ + int err; + + if (devlink->ops && devlink->ops->port_type_set) { + if (port_type == DEVLINK_PORT_TYPE_NOTSET) + return -EINVAL; + err = devlink->ops->port_type_set(devlink_port, port_type); + if (err) + return err; + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; + } + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; + int err; + + if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { + enum devlink_port_type port_type; + + port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); + err = devlink_port_type_set(devlink, devlink_port, port_type); + if (err) + return err; + } + return 0; +} + +static int devlink_port_split(struct devlink *devlink, + u32 port_index, u32 count) + +{ + if (devlink->ops && devlink->ops->port_split) + return devlink->ops->port_split(devlink, port_index, count); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + u32 count; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] || + !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); + return devlink_port_split(devlink, port_index, count); +} + +static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) + +{ + if (devlink->ops && devlink->ops->port_unsplit) + return devlink->ops->port_unsplit(devlink, port_index); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + u32 port_index; + + if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) + return -EINVAL; + + port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + return devlink_port_unsplit(devlink, port_index); +} + +static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, +}; + +static const struct genl_ops devlink_nl_ops[] = { + { + .cmd = DEVLINK_CMD_GET, + .doit = devlink_nl_cmd_get_doit, + .dumpit = devlink_nl_cmd_get_dumpit, + .policy = devlink_nl_policy, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .doit = devlink_nl_cmd_port_get_doit, + .dumpit = devlink_nl_cmd_port_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_SET, + .doit = devlink_nl_cmd_port_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_PORT_SPLIT, + .doit = devlink_nl_cmd_port_split_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_PORT_UNSPLIT, + .doit = devlink_nl_cmd_port_unsplit_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +/** + * devlink_alloc - Allocate new devlink instance resources + * + * @ops: ops + * @priv_size: size of user private data + * + * Allocate new devlink instance resources, including devlink index + * and name. + */ +struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +{ + struct devlink *devlink; + + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + if (!devlink) + return NULL; + devlink->ops = ops; + devlink_net_set(devlink, &init_net); + INIT_LIST_HEAD(&devlink->port_list); + return devlink; +} +EXPORT_SYMBOL_GPL(devlink_alloc); + +/** + * devlink_register - Register devlink instance + * + * @devlink: devlink + */ +int devlink_register(struct devlink *devlink, struct device *dev) +{ + mutex_lock(&devlink_mutex); + devlink->dev = dev; + list_add_tail(&devlink->list, &devlink_list); + devlink_notify(devlink, DEVLINK_CMD_NEW); + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_register); + +/** + * devlink_unregister - Unregister devlink instance + * + * @devlink: devlink + */ +void devlink_unregister(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink_notify(devlink, DEVLINK_CMD_DEL); + list_del(&devlink->list); + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_unregister); + +/** + * devlink_free - Free devlink instance resources + * + * @devlink: devlink + */ +void devlink_free(struct devlink *devlink) +{ + kfree(devlink); +} +EXPORT_SYMBOL_GPL(devlink_free); + +/** + * devlink_port_register - Register devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * @port_index + * + * Register devlink port with provided port index. User can use + * any indexing, even hw-related one. devlink_port structure + * is convenient to be embedded inside user driver private structure. + * Note that the caller should take care of zeroing the devlink_port + * structure. + */ +int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + mutex_lock(&devlink_port_mutex); + if (devlink_port_index_exists(devlink, port_index)) { + mutex_unlock(&devlink_port_mutex); + return -EEXIST; + } + devlink_port->devlink = devlink; + devlink_port->index = port_index; + devlink_port->type = DEVLINK_PORT_TYPE_NOTSET; + devlink_port->registered = true; + list_add_tail(&devlink_port->list, &devlink->port_list); + mutex_unlock(&devlink_port_mutex); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_register); + +/** + * devlink_port_unregister - Unregister devlink port + * + * @devlink_port: devlink port + */ +void devlink_port_unregister(struct devlink_port *devlink_port) +{ + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + mutex_lock(&devlink_port_mutex); + list_del(&devlink_port->list); + mutex_unlock(&devlink_port_mutex); +} +EXPORT_SYMBOL_GPL(devlink_port_unregister); + +static void __devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type type, + void *type_dev) +{ + devlink_port->type = type; + devlink_port->type_dev = type_dev; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} + +/** + * devlink_port_type_eth_set - Set port type to Ethernet + * + * @devlink_port: devlink port + * @netdev: related netdevice + */ +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_ETH, netdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); + +/** + * devlink_port_type_ib_set - Set port type to InfiniBand + * + * @devlink_port: devlink port + * @ibdev: related IB device + */ +void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_IB, ibdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); + +/** + * devlink_port_type_clear - Clear port type + * + * @devlink_port: devlink port + */ +void devlink_port_type_clear(struct devlink_port *devlink_port) +{ + return __devlink_port_type_set(devlink_port, + DEVLINK_PORT_TYPE_NOTSET, NULL); +} +EXPORT_SYMBOL_GPL(devlink_port_type_clear); + +/** + * devlink_port_split_set - Set port is split + * + * @devlink_port: devlink port + * @split_group: split group - identifies group split port is part of + */ +void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group) +{ + devlink_port->split = true; + devlink_port->split_group = split_group; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} +EXPORT_SYMBOL_GPL(devlink_port_split_set); + +static int __init devlink_module_init(void) +{ + return genl_register_family_with_ops_groups(&devlink_nl_family, + devlink_nl_ops, + devlink_nl_mcgrps); +} + +static void __exit devlink_module_exit(void) +{ + genl_unregister_family(&devlink_nl_family); +} + +module_init(devlink_module_init); +module_exit(devlink_module_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>"); +MODULE_DESCRIPTION("Network physical device Netlink interface"); +MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME); diff --git a/net/core/dst.c b/net/core/dst.c index a1656e3b8d72..b5cbbe07f786 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -265,7 +265,7 @@ again: lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) - kfree(dst); + metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); @@ -395,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} + struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) { int cpu; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 000000000000..3938f3f38d69 --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,168 @@ +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <net/dst_cache.h> +#include <net/route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_fib.h> +#endif +#include <uapi/linux/in.h> + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + if (!dst_cache->cache) + return NULL; + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *addr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in6_saddr; + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index daf04709dd3c..2966cd0d7c93 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", + [NETIF_F_HW_TC_BIT] = "hw-tc-offload", }; static const char @@ -386,43 +387,461 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } -int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { + bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); + dst[0] = legacy_u32; +} + +/* return false if src had higher bits set. lower bits always updated. */ +static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32, + const unsigned long *src) +{ + bool retval = true; + + /* TODO: following test will soon always be true */ + if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); + + bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_fill(ext, 32); + bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + if (bitmap_intersects(ext, src, + __ETHTOOL_LINK_MODE_MASK_NBITS)) { + /* src mask goes beyond bit 31 */ + retval = false; + } + } + *legacy_u32 = src[0]; + return retval; +} + +/* return false if legacy contained non-0 deprecated fields + * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated + */ +static bool +convert_legacy_settings_to_link_ksettings( + struct ethtool_link_ksettings *link_ksettings, + const struct ethtool_cmd *legacy_settings) +{ + bool retval = true; + + memset(link_ksettings, 0, sizeof(*link_ksettings)); + + /* This is used to tell users that driver is still using these + * deprecated legacy fields, and they should not use + * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS + */ + if (legacy_settings->transceiver || + legacy_settings->maxtxpkt || + legacy_settings->maxrxpkt) + retval = false; + + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.supported, + legacy_settings->supported); + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.advertising, + legacy_settings->advertising); + convert_legacy_u32_to_link_mode( + link_ksettings->link_modes.lp_advertising, + legacy_settings->lp_advertising); + link_ksettings->base.speed + = ethtool_cmd_speed(legacy_settings); + link_ksettings->base.duplex + = legacy_settings->duplex; + link_ksettings->base.port + = legacy_settings->port; + link_ksettings->base.phy_address + = legacy_settings->phy_address; + link_ksettings->base.autoneg + = legacy_settings->autoneg; + link_ksettings->base.mdio_support + = legacy_settings->mdio_support; + link_ksettings->base.eth_tp_mdix + = legacy_settings->eth_tp_mdix; + link_ksettings->base.eth_tp_mdix_ctrl + = legacy_settings->eth_tp_mdix_ctrl; + return retval; +} + +/* return false if ksettings link modes had higher bits + * set. legacy_settings always updated (best effort) + */ +static bool +convert_link_ksettings_to_legacy_settings( + struct ethtool_cmd *legacy_settings, + const struct ethtool_link_ksettings *link_ksettings) +{ + bool retval = true; + + memset(legacy_settings, 0, sizeof(*legacy_settings)); + /* this also clears the deprecated fields in legacy structure: + * __u8 transceiver; + * __u32 maxtxpkt; + * __u32 maxrxpkt; + */ + + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->supported, + link_ksettings->link_modes.supported); + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->advertising, + link_ksettings->link_modes.advertising); + retval &= convert_link_mode_to_legacy_u32( + &legacy_settings->lp_advertising, + link_ksettings->link_modes.lp_advertising); + ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); + legacy_settings->duplex + = link_ksettings->base.duplex; + legacy_settings->port + = link_ksettings->base.port; + legacy_settings->phy_address + = link_ksettings->base.phy_address; + legacy_settings->autoneg + = link_ksettings->base.autoneg; + legacy_settings->mdio_support + = link_ksettings->base.mdio_support; + legacy_settings->eth_tp_mdix + = link_ksettings->base.eth_tp_mdix; + legacy_settings->eth_tp_mdix_ctrl + = link_ksettings->base.eth_tp_mdix_ctrl; + return retval; +} + +/* number of 32-bit words to store the user's link mode bitmaps */ +#define __ETHTOOL_LINK_MODE_MASK_NU32 \ + DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) + +/* layout of the struct passed from/to userland */ +struct ethtool_link_usettings { + struct ethtool_link_settings base; + struct { + __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; + } link_modes; +}; + +/* Internal kernel helper to query a device ethtool_link_settings. + * + * Backward compatibility note: for compatibility with legacy drivers + * that implement only the ethtool_cmd API, this has to work with both + * drivers implementing get_link_ksettings API and drivers + * implementing get_settings API. When drivers implement get_settings + * and report ethtool_cmd deprecated fields + * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored + * because the resulting struct ethtool_link_settings does not report them. + */ +int __ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + int err; + struct ethtool_cmd cmd; + ASSERT_RTNL(); + if (dev->ethtool_ops->get_link_ksettings) { + memset(link_ksettings, 0, sizeof(*link_ksettings)); + return dev->ethtool_ops->get_link_ksettings(dev, + link_ksettings); + } + + /* driver doesn't support %ethtool_link_ksettings API. revert to + * legacy %ethtool_cmd API, unless it's not supported either. + * TODO: remove when ethtool_ops::get_settings disappears internally + */ if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; - memset(cmd, 0, sizeof(struct ethtool_cmd)); - cmd->cmd = ETHTOOL_GSET; - return dev->ethtool_ops->get_settings(dev, cmd); + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt + */ + convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd); + return err; } -EXPORT_SYMBOL(__ethtool_get_settings); +EXPORT_SYMBOL(__ethtool_get_link_ksettings); -static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +/* convert ethtool_link_usettings in user space to a kernel internal + * ethtool_link_ksettings. return 0 on success, errno on error. + */ +static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, + const void __user *from) { - int err; - struct ethtool_cmd cmd; + struct ethtool_link_usettings link_usettings; + + if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) + return -EFAULT; + + memcpy(&to->base, &link_usettings.base, sizeof(to->base)); + bitmap_from_u32array(to->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + bitmap_from_u32array(to->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32); + + return 0; +} + +/* convert a kernel internal ethtool_link_ksettings to + * ethtool_link_usettings in user space. return 0 on success, errno on + * error. + */ +static int +store_link_ksettings_for_user(void __user *to, + const struct ethtool_link_ksettings *from) +{ + struct ethtool_link_usettings link_usettings; + + memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); + bitmap_to_u32array(link_usettings.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + bitmap_to_u32array(link_usettings.link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NU32, + from->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); + + if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) + return -EFAULT; + + return 0; +} + +/* Query device for its ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::get_link_ksettings, even if legacy + * ethtool_ops::get_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_GSET for + * this driver, so that they can correctly access the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_get_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err = 0; + struct ethtool_link_ksettings link_ksettings; - err = __ethtool_get_settings(dev, &cmd); + ASSERT_RTNL(); + + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; + + /* handle bitmap nbits handshake */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) { + /* wrong link mode nbits requested */ + memset(&link_ksettings, 0, sizeof(link_ksettings)); + /* keep cmd field reset to 0 */ + /* send back number of words required as negative val */ + compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, + "need too many bits for link modes!"); + link_ksettings.base.link_mode_masks_nwords + = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); + + /* copy the base fields back to user, not the link + * mode bitmaps + */ + if (copy_to_user(useraddr, &link_ksettings.base, + sizeof(link_ksettings.base))) + return -EFAULT; + + return 0; + } + + /* handshake successful: user/kernel agree on + * link_mode_masks_nwords + */ + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; + /* make sure we tell the right values to user */ + link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + + return store_link_ksettings_for_user(useraddr, &link_ksettings); +} + +/* Update device ethtool_link_settings. + * + * Backward compatibility note: this function must fail when driver + * does not implement ethtool::set_link_ksettings, even if legacy + * ethtool_ops::set_settings is implemented. This tells new versions + * of ethtool that they should use the legacy API %ETHTOOL_SSET for + * this driver, so that they can correctly update the ethtool_cmd + * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver + * implements ethtool_ops::get_settings anymore. + */ +static int ethtool_set_link_ksettings(struct net_device *dev, + void __user *useraddr) +{ + int err; + struct ethtool_link_ksettings link_ksettings; + + ASSERT_RTNL(); + + if (!dev->ethtool_ops->set_link_ksettings) + return -EOPNOTSUPP; + + /* make sure nbits field has expected value */ + if (copy_from_user(&link_ksettings.base, useraddr, + sizeof(link_ksettings.base))) + return -EFAULT; + + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + /* copy the whole structure, now that we know it has expected + * format + */ + err = load_link_ksettings_from_user(&link_ksettings, useraddr); + if (err) + return err; + + /* re-check nwords field, just in case */ + if (__ETHTOOL_LINK_MODE_MASK_NU32 + != link_ksettings.base.link_mode_masks_nwords) + return -EINVAL; + + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); +} + +static void +warn_incomplete_ethtool_legacy_settings_conversion(const char *details) +{ + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n", + get_task_comm(name, current), details); +} + +/* Query device for its ethtool_cmd settings. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing get_link_ksettings + * API and drivers implementing get_settings API. When drivers + * implement get_link_ksettings and report higher link mode bits, a + * kernel warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, but the command is successful + * (only the lower link mode bits reported back to user). + */ +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + ASSERT_RTNL(); + + if (dev->ethtool_ops->get_link_ksettings) { + /* First, use link_ksettings API if it is supported */ + int err; + struct ethtool_link_ksettings link_ksettings; + + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, + &link_ksettings); + if (err < 0) + return err; + if (!convert_link_ksettings_to_legacy_settings(&cmd, + &link_ksettings)) + warn_incomplete_ethtool_legacy_settings_conversion( + "link modes are only partially reported"); + + /* send a sensible cmd tag back to user */ + cmd.cmd = ETHTOOL_GSET; + } else { + /* driver doesn't support %ethtool_link_ksettings + * API. revert to legacy %ethtool_cmd API, unless it's + * not supported either. + */ + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(&cmd, 0, sizeof(cmd)); + cmd.cmd = ETHTOOL_GSET; + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + } + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; + return 0; } +/* Update device link settings with given ethtool_cmd. + * + * Backward compatibility note: for compatibility with legacy ethtool, + * this has to work with both drivers implementing set_link_ksettings + * API and drivers implementing set_settings API. When drivers + * implement set_link_ksettings and user's request updates deprecated + * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel + * warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, and the request is rejected. + */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd; - if (!dev->ethtool_ops->set_settings) - return -EOPNOTSUPP; + ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; + /* first, try new %ethtool_link_ksettings API. */ + if (dev->ethtool_ops->set_link_ksettings) { + struct ethtool_link_ksettings link_ksettings; + + if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, + &cmd)) + return -EINVAL; + + link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS; + link_ksettings.base.link_mode_masks_nwords + = __ETHTOOL_LINK_MODE_MASK_NU32; + return dev->ethtool_ops->set_link_ksettings(dev, + &link_ksettings); + } + + /* legacy %ethtool_cmd API */ + + /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings + * disappears internally + */ + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + return dev->ethtool_ops->set_settings(dev, &cmd); } @@ -632,7 +1051,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, return 0; } -u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { @@ -642,6 +1061,37 @@ void netdev_rss_key_fill(void *buffer, size_t len) } EXPORT_SYMBOL(netdev_rss_key_fill); +static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -738,6 +1188,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (user_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(indir); @@ -897,6 +1355,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (ret) + goto out; + + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rss_config); @@ -1227,14 +1693,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels; + struct ethtool_channels channels, max; + u32 max_rx_in_use = 0; - if (!dev->ethtool_ops->set_channels) + if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; + dev->ethtool_ops->get_channels(dev, &max); + + /* ensure new counts are within the maximums */ + if ((channels.rx_count > max.max_rx) || + (channels.tx_count > max.max_tx) || + (channels.combined_count > max.max_combined) || + (channels.other_count > max.max_other)) + return -EINVAL; + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } @@ -1823,13 +2306,121 @@ out: return ret; } +static int ethtool_get_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int ret; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if (!dev->ethtool_ops->get_per_queue_coalesce) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + return ret; + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + useraddr += sizeof(coalesce); + } + + return 0; +} + +static int ethtool_set_per_queue_coalesce(struct net_device *dev, + void __user *useraddr, + struct ethtool_per_queue_op *per_queue_opt) +{ + u32 bit; + int i, ret = 0; + int n_queue; + struct ethtool_coalesce *backup = NULL, *tmp = NULL; + DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); + + if ((!dev->ethtool_ops->set_per_queue_coalesce) || + (!dev->ethtool_ops->get_per_queue_coalesce)) + return -EOPNOTSUPP; + + useraddr += sizeof(*per_queue_opt); + + bitmap_from_u32array(queue_mask, + MAX_NUM_QUEUE, + per_queue_opt->queue_mask, + DIV_ROUND_UP(MAX_NUM_QUEUE, 32)); + n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); + tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); + if (!backup) + return -ENOMEM; + + for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { + struct ethtool_coalesce coalesce; + + ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); + if (ret != 0) + goto roll_back; + + tmp++; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { + ret = -EFAULT; + goto roll_back; + } + + ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); + if (ret != 0) + goto roll_back; + + useraddr += sizeof(coalesce); + } + +roll_back: + if (ret != 0) { + tmp = backup; + for_each_set_bit(i, queue_mask, bit) { + dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); + tmp++; + } + } + kfree(backup); + + return ret; +} + +static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_per_queue_op per_queue_opt; + + if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) + return -EFAULT; + + switch (per_queue_opt.sub_command) { + case ETHTOOL_GCOALESCE: + return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); + case ETHTOOL_SCOALESCE: + return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); + default: + return -EOPNOTSUPP; + }; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); void __user *useraddr = ifr->ifr_data; - u32 ethcmd; + u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; @@ -1839,8 +2430,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (ethcmd == ETHTOOL_PERQUEUE) { + if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) + return -EFAULT; + } else { + sub_cmd = ethcmd; + } /* Allow some commands to be done by anyone */ - switch (ethcmd) { + switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: @@ -2070,6 +2667,15 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; + case ETHTOOL_PERQUEUE: + rc = ethtool_set_per_queue(dev, useraddr); + break; + case ETHTOOL_GLINKSETTINGS: + rc = ethtool_get_link_ksettings(dev, useraddr); + break; + case ETHTOOL_SLINKSETTINGS: + rc = ethtool_set_link_ksettings(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index 94d26201080d..a3aba15a8025 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -530,12 +530,14 @@ do_pass: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - /* RET_K, RET_A are remaped into 2 insns. */ + /* RET_K is remaped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: - *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? - BPF_K : BPF_X, BPF_REG_0, - BPF_REG_A, fp->k); + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); *insn = BPF_EXIT_INSN(); break; @@ -1181,7 +1183,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) if (bpf_prog_size(prog->len) > sysctl_optmem_max) return -ENOMEM; - if (sk_unhashed(sk)) { + if (sk_unhashed(sk) && sk->sk_reuseport) { err = reuseport_alloc(sk); if (err) return err; @@ -1333,15 +1335,22 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_LDST_LEN 16U +struct bpf_scratchpad { + union { + __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; + u8 buff[MAX_BPF_STACK]; + }; +}; + +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); struct sk_buff *skb = (struct sk_buff *) (long) r1; int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[BPF_LDST_LEN]; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) @@ -1355,14 +1364,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) * * so check for invalid 'offset' and too large 'len' */ - if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) + if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff))) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + len))) + if (unlikely(skb_try_make_writable(skb, offset + len))) return -EFAULT; - ptr = skb_header_pointer(skb, offset, len, buf); + ptr = skb_header_pointer(skb, offset, len, sp->buff); if (unlikely(!ptr)) return -EFAULT; @@ -1371,7 +1378,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) memcpy(ptr, from, len); - if (ptr == buf) + if (ptr == sp->buff) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); @@ -1400,7 +1407,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) + if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK)) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, to); @@ -1432,9 +1439,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1474,23 +1479,31 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; int offset = (int) r2; __sum16 sum, *ptr; - if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | + BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - - if (unlikely(skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(sum)))) + if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); if (unlikely(!ptr)) return -EFAULT; + if (is_mmzero && !*ptr) + return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1501,6 +1514,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; } + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; if (ptr == &sum) /* skb_store_bits guaranteed to not return -EFAULT here */ skb_store_bits(skb, offset, ptr, sizeof(sum)); @@ -1519,6 +1534,45 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +{ + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); + u64 diff_size = from_size + to_size; + __be32 *from = (__be32 *) (long) r1; + __be32 *to = (__be32 *) (long) r3; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; + + return csum_partial(sp->diff, diff_size, seed); +} + +const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; @@ -1682,6 +1736,13 @@ bool bpf_helper_changes_skb_data(void *func) return true; if (func == bpf_skb_vlan_pop) return true; + if (func == bpf_skb_store_bytes) + return true; + if (func == bpf_l3_csum_replace) + return true; + if (func == bpf_l4_csum_replace) + return true; + return false; } @@ -1849,6 +1910,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d79699c9d1b9..7c7b8739b8b8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -178,15 +178,16 @@ ip: ip_proto = iph->protocol; - if (!dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) - break; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; @@ -208,7 +209,6 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; - __be32 flow_label; ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); @@ -220,18 +220,21 @@ ipv6: if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { - struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; - - key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, - target_container); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); - memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + memcpy(&key_addrs->v6addrs, &iph->saddr, + sizeof(key_addrs->v6addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - flow_label = ip6_flowlabel(iph); - if (flow_label) { + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, @@ -336,8 +339,11 @@ mpls: } case htons(ETH_P_FCOE): - key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); - /* fall through */ + if ((hlen - nhoff) < FCOE_HEADER_LEN) + goto out_bad; + + nhoff += FCOE_HEADER_LEN; + goto out_good; default: goto out_bad; } @@ -396,6 +402,13 @@ ip_proto_again: goto out_bad; proto = eth->h_proto; nhoff += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = nhoff; } key_control->flags |= FLOW_DIS_ENCAPSULATION; @@ -437,13 +450,12 @@ ip_proto_again: key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); + ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { - ip_proto = fh->nexthdr; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) goto ip_proto_again; - } } goto out_good; } @@ -730,6 +742,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, { u32 poff = keys->control.thoff; + /* skip L4 headers for fragments after the first */ + if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && + !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) + return poff; + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 299cfc24d888..669ecc9f884e 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -27,6 +27,31 @@ #include <net/rtnetlink.h> #include <net/ip6_fib.h> +#ifdef CONFIG_MODULES + +static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) +{ + /* Only lwt encaps implemented without using an interface for + * the encap need to return a string here. + */ + switch (encap_type) { + case LWTUNNEL_ENCAP_MPLS: + return "MPLS"; + case LWTUNNEL_ENCAP_ILA: + return "ILA"; + case LWTUNNEL_ENCAP_IP6: + case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_NONE: + case __LWTUNNEL_ENCAP_MAX: + /* should not have got here */ + WARN_ON(1); + break; + } + return NULL; +} + +#endif /* CONFIG_MODULES */ + struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; @@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); +#ifdef CONFIG_MODULES + if (!ops) { + const char *encap_type_str = lwtunnel_encap_str(encap_type); + + if (encap_type_str) { + rcu_read_unlock(); + request_module("rtnl-lwt-%s", encap_type_str); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + } + } +#endif if (likely(ops && ops->build_state)) ret = ops->build_state(dev, encap, family, cfg, lws); rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b6c8a6629b39..2b3f76fe65f4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -29,7 +29,6 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; -static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -199,9 +198,10 @@ static ssize_t speed_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) + ret = sprintf(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; @@ -218,10 +218,12 @@ static ssize_t duplex_show(struct device *dev, return restart_syscall(); if (netif_running(netdev)) { - struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) { + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; - switch (cmd.duplex) { + + switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; @@ -574,6 +576,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); +NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] = { &dev_attr_rx_packets.attr, @@ -599,6 +602,7 @@ static struct attribute *netstat_attrs[] = { &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, + &dev_attr_rx_nohandler.attr, NULL }; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 0260c84ed83c..11fce17274f6 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -9,7 +9,6 @@ * Authors: Thomas Graf <tgraf@suug.ch> */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f1efbc39ef6b..2ec86fc552df 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d735e854f916..62737f437c8e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; + + a->rx_nohandler = b->rx_nohandler; } static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) @@ -1412,6 +1414,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, }; +static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +{ + const struct rtnl_link_ops *ops = NULL; + struct nlattr *linfo[IFLA_INFO_MAX + 1]; + + if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0) + return NULL; + + if (linfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } + + return ops; +} + +static bool link_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool link_kind_filtered(const struct net_device *dev, + const struct rtnl_link_ops *kind_ops) +{ + if (kind_ops && dev->rtnl_link_ops != kind_ops) + return true; + + return false; +} + +static bool link_dump_filtered(struct net_device *dev, + int master_idx, + const struct rtnl_link_ops *kind_ops) +{ + if (link_master_filtered(dev, master_idx) || + link_kind_filtered(dev, kind_ops)) + return true; + + return false; +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -1421,6 +1475,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; + const struct rtnl_link_ops *kind_ops = NULL; + unsigned int flags = NLM_F_MULTI; + int master_idx = 0; int err; int hdrlen; @@ -1443,18 +1500,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + if (tb[IFLA_MASTER]) + master_idx = nla_get_u32(tb[IFLA_MASTER]); + + if (tb[IFLA_LINKINFO]) + kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); + + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI, + flags, ext_filter_mask); /* If we ran out of room on the first message, * we're in trouble diff --git a/net/core/scm.c b/net/core/scm.c index 14596fb37172..2696aefdc148 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fplp = fpl; fpl->count = 0; fpl->max = SCM_MAX_FD; + fpl->user = NULL; } fpp = &fpl->fp[fpl->count]; @@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fpp++ = file; fpl->count++; } + + if (!fpl->user) + fpl->user = get_uid(current_user()); + return num; } @@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm) scm->fp = NULL; for (i=fpl->count-1; i>=0; i--) fput(fpl->fp[i]); + free_uid(fpl->user); kfree(fpl); } } @@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2df375ec9c2..488566b09c6d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,8 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; +EXPORT_SYMBOL(sysctl_max_skb_frags); /** * skb_panic - private function for out-of-line support @@ -347,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +#define NAPI_SKB_CACHE_SIZE 64 + +struct napi_alloc_cache { + struct page_frag_cache page; + size_t skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { @@ -378,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(nc, fragsz, gfp_mask); + return __alloc_page_frag(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -474,7 +484,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; @@ -494,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(nc, len, gfp_mask); + data = __alloc_page_frag(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -505,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } /* use OR instead of assignment to avoid clearing of bits in mask */ - if (nc->pfmemalloc) + if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -747,6 +757,73 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +void __kfree_skb_flush(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* flush skb_cache if containing objects */ + if (nc->skb_count) { + kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, + nc->skb_cache); + nc->skb_count = 0; + } +} + +static inline void _kfree_skb_defer(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* drop skb->head and call any destructors for packet */ + skb_release_all(skb); + + /* record skb to CPU local list */ + nc->skb_cache[nc->skb_count++] = skb; + +#ifdef CONFIG_SLUB + /* SLUB writes into objects when freeing */ + prefetchw(skb); +#endif + + /* flush skb_cache if it is filled */ + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, + nc->skb_cache); + nc->skb_count = 0; + } +} +void __kfree_skb_defer(struct sk_buff *skb) +{ + _kfree_skb_defer(skb); +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + if (unlikely(!skb)) + return; + + /* if budget is 0 assume netpoll w/ IRQs disabled */ + if (unlikely(!budget)) { + dev_consume_skb_irq(skb); + return; + } + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb); + + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); + return; + } + + _kfree_skb_defer(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + /* Make sure a field is enclosed inside headers_start/headers_end section */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ @@ -3004,8 +3081,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); - csum = !head_skb->encap_hdr_csum && - !!can_checksum_protocol(features, proto); + csum = !!can_checksum_protocol(features, proto); headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3098,13 +3174,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg && !nskb->remcsum_offload) { - nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(head_skb, offset, - skb_put(nskb, len), - len, 0); + if (!sg) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, len), + len, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; continue; } @@ -3170,12 +3248,19 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum && !nskb->remcsum_offload) { - nskb->csum = skb_checksum(nskb, doffset, - nskb->len - doffset, 0); - nskb->ip_summed = CHECKSUM_NONE; + if (!csum) { + if (skb_has_shared_frag(nskb)) { + err = __skb_linearize(nskb); + if (err) + goto err; + } + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_checksum(nskb, doffset, + nskb->len - doffset, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + skb_headroom(nskb) + doffset; } } while ((offset += len) < head_skb->len); @@ -4413,9 +4498,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) skb->mac_len += VLAN_HLEN; __skb_pull(skb, offset); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 51270238e269..4493ff820c2c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -195,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap) } EXPORT_SYMBOL(sk_net_capable); - -#ifdef CONFIG_MEMCG_KMEM -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - struct proto *proto; - int ret = 0; - - mutex_lock(&proto_list_mutex); - list_for_each_entry(proto, &proto_list, node) { - if (proto->init_cgroup) { - ret = proto->init_cgroup(memcg, ss); - if (ret) - goto out; - } - } - - mutex_unlock(&proto_list_mutex); - return ret; -out: - list_for_each_entry_continue_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); - return ret; -} - -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) -{ - struct proto *proto; - - mutex_lock(&proto_list_mutex); - list_for_each_entry_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); -} -#endif - /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -240,11 +202,6 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -#if defined(CONFIG_MEMCG_KMEM) -struct static_key memcg_socket_limit_enabled; -EXPORT_SYMBOL(memcg_socket_limit_enabled); -#endif - /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket @@ -1030,6 +987,10 @@ set_rcvbuf: sk->sk_incoming_cpu = val; break; + case SO_CNX_ADVICE: + if (val == 1) + dst_negative_advice(sk); + break; default: ret = -ENOPROTOOPT; break; @@ -1507,12 +1468,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -static void sk_update_clone(const struct sock *sk, struct sock *newsk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sock_update_memcg(newsk); -} - /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1580,6 +1535,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk = NULL; goto out; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; newsk->sk_priority = 0; @@ -1607,7 +1563,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - sk_update_clone(sk, newsk); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); @@ -2089,27 +2046,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; - int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = sk_memory_allocated_add(sk, amt, &parent_status); + allocated = sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) + goto suppress_allocation; /* Under limit. */ - if (parent_status == UNDER_LIMIT && - allocated <= sk_prot_mem_limits(sk, 0)) { + if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. (we or our parents) */ - if ((parent_status > SOFT_LIMIT) || - allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit (we or our parents) */ - if ((parent_status == OVER_LIMIT) || - (allocated > sk_prot_mem_limits(sk, 2))) + /* Over hard limit. */ + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ @@ -2158,6 +2115,9 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -2173,6 +2133,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 1df98c557440..e92b759d906c 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -93,10 +93,17 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, const struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2) { struct sock_reuseport *reuse; + if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { + int err = reuseport_alloc(sk2); + + if (err) + return err; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)), diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 95b6139d710c..a6beb7b6ae55 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,7 @@ static int zero = 0; static int one = 1; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; +static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "max_skb_frags", + .data = &sysctl_max_skb_frags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &max_skb_frags, + }, { } }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5684e14932bd..b5672e5fe649 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -802,7 +802,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) } lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, + sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " @@ -824,26 +824,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v4_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9c6d0508e63a..4663a01d5039 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -668,7 +668,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, + sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, inet6_iif(skb)); if (!sk) { @@ -691,26 +691,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v6_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } @@ -993,7 +993,7 @@ static struct proto dccp_v6_prot = { .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 40b9ca72aae3..27bf03d11670 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -201,47 +201,6 @@ out: return 0; } -static int dsa_bridge_check_vlan_range(struct dsa_switch *ds, - const struct net_device *bridge, - u16 vid_begin, u16 vid_end) -{ - struct dsa_slave_priv *p; - struct net_device *dev, *vlan_br; - DECLARE_BITMAP(members, DSA_MAX_PORTS); - DECLARE_BITMAP(untagged, DSA_MAX_PORTS); - u16 vid; - int member, err; - - if (!ds->drv->vlan_getnext || !vid_begin) - return -EOPNOTSUPP; - - vid = vid_begin - 1; - - do { - err = ds->drv->vlan_getnext(ds, &vid, members, untagged); - if (err) - break; - - if (vid > vid_end) - break; - - member = find_first_bit(members, DSA_MAX_PORTS); - if (member == DSA_MAX_PORTS) - continue; - - dev = ds->ports[member]; - p = netdev_priv(dev); - vlan_br = p->bridge_dev; - if (vlan_br == bridge) - continue; - - netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid); - return -EOPNOTSUPP; - } while (vid < vid_end); - - return err == -ENOENT ? 0 : err; -} - static int dsa_slave_port_vlan_add(struct net_device *dev, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) @@ -254,15 +213,6 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) return -EOPNOTSUPP; - /* If the requested port doesn't belong to the same bridge as - * the VLAN members, fallback to software VLAN (hopefully). - */ - err = dsa_bridge_check_vlan_range(ds, p->bridge_dev, - vlan->vid_begin, - vlan->vid_end); - if (err) - return err; - err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); if (err) return err; @@ -293,41 +243,11 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - DECLARE_BITMAP(members, DSA_MAX_PORTS); - DECLARE_BITMAP(untagged, DSA_MAX_PORTS); - u16 pvid, vid = 0; - int err; - - if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get) - return -EOPNOTSUPP; - - err = ds->drv->port_pvid_get(ds, p->port, &pvid); - if (err) - return err; - - for (;;) { - err = ds->drv->vlan_getnext(ds, &vid, members, untagged); - if (err) - break; - - if (!test_bit(p->port, members)) - continue; - - memset(vlan, 0, sizeof(*vlan)); - vlan->vid_begin = vlan->vid_end = vid; - if (vid == pvid) - vlan->flags |= BRIDGE_VLAN_INFO_PVID; + if (ds->drv->port_vlan_dump) + return ds->drv->port_vlan_dump(ds, p->port, vlan, cb); - if (test_bit(p->port, untagged)) - vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED; - - err = cb(&vlan->obj); - if (err) - break; - } - - return err == -ENOENT ? 0 : err; + return -EOPNOTSUPP; } static int dsa_slave_port_fdb_add(struct net_device *dev, @@ -385,31 +305,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -/* Return a bitmask of all ports being currently bridged within a given bridge - * device. Note that on leave, the mask will still return the bitmask of ports - * currently bridged, prior to port removal, and this is exactly what we want. - */ -static u32 dsa_slave_br_port_mask(struct dsa_switch *ds, - struct net_device *bridge) -{ - struct dsa_slave_priv *p; - unsigned int port; - u32 mask = 0; - - for (port = 0; port < DSA_MAX_PORTS; port++) { - if (!dsa_is_port_initialized(ds, port)) - continue; - - p = netdev_priv(ds->ports[port]); - - if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT && - p->bridge_dev == bridge) - mask |= 1 << port; - } - - return mask; -} - static int dsa_slave_stp_update(struct net_device *dev, u8 state) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -422,6 +317,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) return ret; } +static int dsa_slave_vlan_filtering(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + if (ds->drv->port_vlan_filtering) + return ds->drv->port_vlan_filtering(ds, p->port, + attr->u.vlan_filtering); + + return 0; +} + static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -438,6 +351,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = ds->drv->port_stp_update(ds, p->port, attr->u.stp_state); break; + case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: + ret = dsa_slave_vlan_filtering(dev, attr, trans); + break; default: ret = -EOPNOTSUPP; break; @@ -533,8 +449,7 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, p->bridge_dev = br; if (ds->drv->port_join_bridge) - ret = ds->drv->port_join_bridge(ds, p->port, - dsa_slave_br_port_mask(ds, br)); + ret = ds->drv->port_join_bridge(ds, p->port, br); return ret; } @@ -547,8 +462,7 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev) if (ds->drv->port_leave_bridge) - ret = ds->drv->port_leave_bridge(ds, p->port, - dsa_slave_br_port_mask(ds, p->bridge_dev)); + ret = ds->drv->port_leave_bridge(ds, p->port); p->bridge_dev = NULL; @@ -1194,7 +1108,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); - phy_disconnect(p->phy); ds->ports[port] = NULL; free_netdev(slave_dev); return ret; @@ -1205,6 +1118,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); + unregister_netdev(slave_dev); free_netdev(slave_dev); return ret; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 103871784e50..66dff5e3d772 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL(eth_header); */ u32 eth_get_headlen(void *data, unsigned int len) { + const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; struct flow_keys keys; @@ -134,7 +135,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len, 0)) + sizeof(*eth), len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a548be247e15..e0bd013a1e5e 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); -static void raw_hash(struct sock *sk) +static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&raw_lock); + + return 0; } static void raw_unhash(struct sock *sk) @@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk) return container_of(sk, struct dgram_sock, sk); } -static void dgram_hash(struct sock *sk) +static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&dgram_lock); + + return 0; } static void dgram_unhash(struct sock *sk) @@ -1026,8 +1030,13 @@ static int ieee802154_create(struct net *net, struct socket *sock, /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); - if (sk->sk_prot->hash) - sk->sk_prot->hash(sk); + if (sk->sk_prot->hash) { + rc = sk->sk_prot->hash(sk); + if (rc) { + sk_common_release(sk); + goto out; + } + } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c22920525e5d..238225b0c970 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX config NET_IP_TUNNEL tristate + select DST_CACHE default n config NET_IPGRE @@ -353,6 +354,7 @@ config INET_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. @@ -404,14 +406,6 @@ config INET_XFRM_MODE_BEET If unsure, say Y. -config INET_LRO - tristate "Large Receive Offload (ipv4/tcp)" - default y - ---help--- - Support for Large Receive Offload (ipv4/tcp). - - If unsure, say Y. - config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c29809f765dc..bfa133691cde 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o -obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o @@ -56,7 +55,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5c5db6636704..209d1ed28954 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -370,7 +370,11 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { @@ -1091,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p) } EXPORT_SYMBOL(inet_unregister_protosw); -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr __read_mostly; - static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -1127,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (new_saddr == old_saddr) return 0; - if (sysctl_ip_dynaddr > 1) { + if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } @@ -1142,8 +1140,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __sk_prot_rehash(sk); - return 0; + return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) @@ -1183,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk) * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ - if (!sysctl_ip_dynaddr || + if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59b3e0e8fd51..c102eb5ac55c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -735,6 +735,14 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out; + /* + * For some 802.11 wireless deployments (and possibly other networks), + * there will be an ARP proxy and gratuitous ARP frames are attacks + * and thus should not be accepted. + */ + if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) + goto out; + /* * Special case: We must set Frame Relay source Q.922 address */ diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cebd9d31e65a..8c3df2ccba45 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1194,6 +1194,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; struct in_device *in_dev; struct net *net = dev_net(dev); + int master_idx; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1214,12 +1215,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (addr) goto out_unlock; no_in_dev: + master_idx = l3mdev_master_ifindex_rcu(dev); + + /* For VRFs, the VRF device takes the place of the loopback device, + * with addresses on it being preferred. Note in such cases the + * loopback device will be among the devices that fail the master_idx + * equality check in the loop below. + */ + if (master_idx && + (dev = dev_get_by_index_rcu(net, master_idx)) && + (in_dev = __in_dev_get_rcu(dev))) { + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + goto out_unlock; + } + } endfor_ifa(in_dev); + } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; @@ -1847,7 +1869,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; @@ -2185,6 +2207,8 @@ static struct devinet_sysctl_table { "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), + DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, + "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2192,6 +2216,8 @@ static struct devinet_sysctl_table { "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), + DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, + "drop_unicast_in_l2_multicast"), }, }; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 744e5936c10d..d07fc076bea0 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head) if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->tn_bits <= TNODE_KMALLOC_MAX) - kfree(n); else - vfree(n); + kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) @@ -1396,9 +1394,10 @@ found: struct fib_info *fi = fa->fa_info; int nhsel, err; - if ((index >= (1ul << fa->fa_slen)) && - ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) - continue; + if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { + if (index >= (1ul << fa->fa_slen)) + continue; + } if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; if (fi->fib_dead) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 976f0dcf6991..88dab0c1670c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -774,7 +774,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); - uh->check = 0; udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); @@ -784,11 +783,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; __be16 sport; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -804,8 +803,8 @@ EXPORT_SYMBOL(fou_build_header); int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; size_t hdrlen, optlen = 0; __be16 sport; @@ -814,7 +813,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { - csum = false; optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; @@ -822,7 +820,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, optlen += need_priv ? GUE_LEN_PRIV : 0; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 5a8ee3282550..47f4c544c916 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -18,15 +18,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); - netdev_features_t enc_features; - int ghl; - struct gre_base_hdr *greh; u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; __be16 protocol = skb->protocol; - int tnl_hlen; - bool csum; + u16 mac_len = skb->mac_len; + int gre_offset, outer_hlen; + bool need_csum, ufo; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -43,74 +41,74 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, if (!skb->encapsulation) goto out; - if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) + if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; - greh = (struct gre_base_hdr *)skb_transport_header(skb); - - ghl = skb_inner_mac_header(skb) - skb_transport_header(skb); - if (unlikely(ghl < sizeof(*greh))) + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; - csum = !!(greh->flags & GRE_CSUM); - if (csum) - skb->encap_hdr_csum = 1; - /* setup inner skb. */ - skb->protocol = greh->protocol; skb->encapsulation = 0; - - if (unlikely(!pskb_may_pull(skb, ghl))) - goto out; - - __skb_pull(skb, ghl); + __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = skb->inner_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); + skb->encap_hdr_csum = need_csum; + + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + + features &= skb->dev->hw_enc_features; + + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags based + * on the fact that we will be computing our checksum in software. + */ + if (ufo) { + features &= ~NETIF_F_CSUM_MASK; + if (!need_csum) + features |= NETIF_F_HW_CSUM; + } /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & features; - segs = skb_mac_gso_segment(skb, enc_features); + segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { - skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); goto out; } + outer_hlen = skb_tnl_header_len(skb); + gre_offset = outer_hlen - tnl_hlen; skb = segs; - tnl_hlen = skb_tnl_header_len(skb); do { - __skb_push(skb, ghl); - if (csum) { - __be32 *pcsum; - - if (skb_has_shared_frag(skb)) { - int err; - - err = __skb_linearize(skb); - if (err) { - kfree_skb_list(segs); - segs = ERR_PTR(err); - goto out; - } - } - - skb_reset_transport_header(skb); + struct gre_base_hdr *greh; + __be32 *pcsum; - greh = (struct gre_base_hdr *) - skb_transport_header(skb); - pcsum = (__be32 *)(greh + 1); - *pcsum = 0; - *(__sum16 *)pcsum = gso_make_checksum(skb, 0); + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; } - __skb_push(skb, tnl_hlen - ghl); - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + skb->mac_len = mac_len; + skb->protocol = protocol; + __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); - skb->mac_len = mac_len; - skb->protocol = protocol; + skb_set_transport_header(skb, gre_offset); + + if (!need_csum) + continue; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + pcsum = (__be32 *)(greh + 1); + + *pcsum = 0; + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } while ((skb = skb->next)); out: return segs; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 05e4cba14162..2aea9f1a2a31 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -107,12 +107,6 @@ #include <linux/seq_file.h> #endif -#define IP_MAX_MEMBERSHIPS 20 -#define IP_MAX_MSF 10 - -/* IGMP reports for link-local multicast groups are enabled by default */ -int sysctl_igmp_llm_reports __read_mostly = 1; - #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -433,6 +427,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; + struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; @@ -440,7 +435,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; - if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || @@ -543,6 +538,7 @@ empty_source: static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; + struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { @@ -551,7 +547,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) @@ -687,7 +683,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) @@ -766,9 +762,10 @@ static void igmp_ifc_timer_expire(unsigned long data) static void igmp_ifc_event(struct in_device *in_dev) { + struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -858,12 +855,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; + struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return false; rcu_read_lock(); @@ -887,6 +885,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, __be32 group = ih->group; int max_delay; int mark = 0; + struct net *net = dev_net(in_dev->dev); if (len == 8) { @@ -972,7 +971,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&im->lock); if (im->tm_running) @@ -1088,6 +1087,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc; + struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not @@ -1102,7 +1102,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1187,6 +1187,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); int reporter; #endif @@ -1198,7 +1199,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; reporter = im->reporter; @@ -1223,6 +1224,9 @@ static void igmp_group_dropped(struct ip_mc_list *im) static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif if (im->loaded == 0) { im->loaded = 1; @@ -1232,7 +1236,7 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; if (in_dev->dead) @@ -1245,7 +1249,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1314,6 +1318,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev, void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { struct ip_mc_list *im; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); @@ -1340,7 +1347,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = sysctl_igmp_qrv; + im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1533,6 +1540,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; + struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); @@ -1540,7 +1548,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; /* a failover is happening and switches @@ -1639,6 +1647,9 @@ void ip_mc_down(struct in_device *in_dev) void ip_mc_init_dev(struct in_device *in_dev) { +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST @@ -1646,7 +1657,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1657,11 +1668,14 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); @@ -1727,11 +1741,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) /* * Join a socket to a group */ -int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; -int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; -#ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; -#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1756,6 +1765,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; + struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ @@ -1766,7 +1776,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1824,12 +1834,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1996,6 +2007,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif @@ -2007,7 +2019,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2074,7 +2086,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) count++; } err = -ENOBUFS; - if (count >= sysctl_igmp_max_memberships) + if (count >= net->ipv4.sysctl_igmp_max_memberships) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) @@ -2246,7 +2258,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } /* else, add a new source to the filter */ - if (psl && psl->sl_count >= sysctl_igmp_max_msf) { + if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto done; } @@ -2919,6 +2931,12 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } + /* Sysctl initialization */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 46b9c887bede..bc5196ea1bdf 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/sock_reuseport.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || - (sk2->sk_state != TCP_TIME_WAIT && + rcu_access_pointer(sk->sk_reuseport_cb) || + (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || @@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, attempts = 5, port = snum; + int smallest_size = -1, smallest_port; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret, attempts = 5; struct net *net = sock_net(sk); - int smallest_size = -1, smallest_rover; + int i, low, high, attempt_half; + struct inet_bind_bucket *tb; kuid_t uid = sock_i_uid(sk); - int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; + u32 remaining, offset; - local_bh_disable(); - if (!snum) { - int remaining, rover, low, high; + if (port) { +have_port: + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; + goto tb_not_found; + } again: - inet_get_local_port_range(net, &low, &high); - if (attempt_half) { - int half = low + ((high - low) >> 1); - - if (attempt_half == 1) - high = half; - else - low = half; - } - remaining = (high - low) + 1; - smallest_rover = rover = prandom_u32() % remaining + low; - - smallest_size = -1; - do { - if (inet_is_local_reserved_port(net, rover)) - goto next_nolock; - head = &hashinfo->bhash[inet_bhashfn(net, rover, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == rover) { - if (((tb->fastreuse > 0 && - sk->sk_reuse && - sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - uid_eq(tb->fastuid, uid))) && - (tb->num_owners < smallest_size || smallest_size == -1)) { - smallest_size = tb->num_owners; - smallest_rover = rover; - } - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = rover; - goto tb_found; - } - goto next; + attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; +other_half_scan: + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + if (high - low < 4) + attempt_half = 0; + if (attempt_half) { + int half = low + (((high - low) >> 2) << 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; + + offset = prandom_u32() % remaining; + /* __inet_hash_connect() favors ports having @low parity + * We do the opposite to not pollute connect() users. + */ + offset |= 1U; + smallest_size = -1; + smallest_port = low; /* avoid compiler warning */ + +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (((tb->fastreuse > 0 && reuse) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_port = port; } - break; - next: - spin_unlock(&head->lock); - next_nolock: - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (remaining <= 0) { - if (smallest_size != -1) { - snum = smallest_rover; - goto have_snum; - } - if (attempt_half == 1) { - /* OK we now try the upper half of the range */ - attempt_half = 2; - goto again; + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) + goto tb_found; + goto next_port; } - goto fail; - } - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. - */ - snum = rover; - } else { -have_snum: - head = &hashinfo->bhash[inet_bhashfn(net, snum, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == snum) - goto tb_found; + goto tb_not_found; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } + + if (smallest_size != -1) { + port = smallest_port; + goto have_port; + } + offset--; + if (!(offset & 1)) + goto other_parity_scan; + + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto other_half_scan; } - tb = NULL; - goto tb_not_found; + return ret; + +tb_not_found: + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) + goto fail_unlock; tb_found: if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (((tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + if (((tb->fastreuse > 0 && reuse) || (tb->fastreuseport > 0 && + !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) { + smallest_size == -1) goto success; - } else { - ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { - if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size != -1 && --attempts >= 0) { - spin_unlock(&head->lock); - goto again; - } - - goto fail_unlock; + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { + if ((reuse || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + smallest_size != -1 && --attempts >= 0) { + spin_unlock_bh(&head->lock); + goto again; } + goto fail_unlock; } - } -tb_not_found: - ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, - net, head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else + if (!reuse) tb->fastreuse = 0; + if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) + tb->fastreuseport = 0; + } else { + tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = 1; tb->fastuid = uid; - } else - tb->fastreuseport = 0; - } else { - if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - if (tb->fastreuseport && - (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + } else { tb->fastreuseport = 0; + } } success: if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); + inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); ret = 0; fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); + spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); @@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); #define AF_INET_FAMILY(fam) true #endif -/* Only thing we need from tcp.h */ -extern int sysctl_tcp_synack_retries; - - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, @@ -557,6 +548,7 @@ static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; struct sock *sk_listener = req->rsk_listener; + struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; int qlen, expire = 0, resend = 0; @@ -566,7 +558,7 @@ static void reqsk_timer_handler(unsigned long data) if (sk_state_load(sk_listener) != TCP_LISTEN) goto drop; - max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; thresh = max_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + int err = -EADDRINUSE; reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -754,13 +747,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog) inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); - return 0; + if (likely(!err)) + return 0; } sk->sk_state = TCP_CLOSE; - return -EADDRINUSE; + return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); @@ -789,14 +783,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, reqsk_put(req); } -void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, - struct sock *child) +struct sock *inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); + child = NULL; } else { req->sk = child; req->dl_next = NULL; @@ -808,6 +804,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); + return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); @@ -817,11 +814,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, if (own_req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. - */ - return child; + if (inet_csk_reqsk_queue_add(sk, req, child)) + return child; } /* Too bad, another child took ownership of the request, undo. */ bh_unlock_sock(child); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8bb8e7ad8548..50c0d96b8441 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -357,17 +357,24 @@ struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *sk; if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) - sk = inet6_lookup(net, hashinfo, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, hashinfo, NULL, 0, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); + } #endif else return ERR_PTR(-EINVAL); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccc5980797fc..bc68eced0105 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@ #include <linux/wait.h> #include <linux/vmalloc.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) @@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; rcu_read_lock(); @@ -229,6 +233,15 @@ begin: if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -246,11 +259,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -449,32 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int inet_reuseport_add_sock(struct sock *sk, + struct inet_listen_hashbucket *ilb, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + + sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + if (sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + saddr_same(sk, sk2, false)) + return reuseport_add_sock(sk, sk2); + } + + /* Initial allocation may have already happened via setsockopt */ + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return reuseport_alloc(sk); + return 0; +} + +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; + int err = 0; if (sk->sk_state != TCP_LISTEN) { inet_ehash_nolisten(sk, osk); - return; + return 0; } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; spin_lock(&ilb->lock); + if (sk->sk_reuseport) { + err = inet_reuseport_add_sock(sk, ilb, saddr_same); + if (err) + goto unlock; + } __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: spin_unlock(&ilb->lock); + + return err; } EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { + int err = 0; + if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk, NULL); + err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); local_bh_enable(); } + + return err; } EXPORT_SYMBOL_GPL(inet_hash); @@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -506,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; - const unsigned short snum = inet_sk(sk)->inet_num; + struct inet_timewait_sock *tw = NULL; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; + int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + struct inet_bind_bucket *tb; + u32 remaining, offset; + int ret, i, low, high; + static u32 hint; + + if (port) { + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + inet_ehash_nolisten(sk, NULL); + spin_unlock_bh(&head->lock); + return 0; + } + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = check_established(death_row, sk, port, NULL); + local_bh_enable(); + return ret; + } - if (!snum) { - int i, remaining, low, high, port; - static u32 hint; - u32 offset = hint + port_offset; - struct inet_timewait_sock *tw = NULL; + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; - inet_get_local_port_range(net, &low, &high); - remaining = (high - low) + 1; + offset = (hint + port_offset) % remaining; + /* In first pass we try ports of @low parity. + * inet_csk_get_port() does the opposite choice. + */ + offset &= ~1U; +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); - /* By starting with offset being an even number, - * we tend to leave about 50% of ports for other uses, - * like bind(0). + /* Does not bother with rcv_saddr checks, because + * the established check is already unique enough. */ - offset &= ~1; - - local_bh_disable(); - for (i = 0; i < remaining; i++) { - port = low + (i + offset) % remaining; - if (inet_is_local_reserved_port(net, port)) - continue; - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && - tb->port == port) { - if (tb->fastreuse >= 0 || - tb->fastreuseport >= 0) - goto next_port; - WARN_ON(hlist_empty(&tb->owners)); - if (!check_established(death_row, sk, - port, &tw)) - goto ok; + inet_bind_bucket_for_each(tb, &head->chain) { + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (tb->fastreuse >= 0 || + tb->fastreuseport >= 0) goto next_port; - } + WARN_ON(hlist_empty(&tb->owners)); + if (!check_established(death_row, sk, + port, &tw)) + goto ok; + goto next_port; } - - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - tb->fastreuseport = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); } - local_bh_enable(); - - return -EADDRNOTAVAIL; -ok: - hint += (i + 2) & ~1; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) { + spin_unlock_bh(&head->lock); + return -ENOMEM; } - if (tw) - inet_twsk_bind_unhash(tw, hinfo); - spin_unlock(&head->lock); + tb->fastreuse = -1; + tb->fastreuseport = -1; + goto ok; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } - if (tw) - inet_twsk_deschedule_put(tw); + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; - ret = 0; - goto out; - } + return -EADDRNOTAVAIL; - head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = check_established(death_row, sk, snum, NULL); -out: - local_bh_enable(); - return ret; +ok: + hint += i + 2; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->inet_sport = htons(port); + inet_ehash_nolisten(sk, (struct sock *)tw); } + if (tw) + inet_twsk_bind_unhash(tw, hinfo); + spin_unlock(&head->lock); + if (tw) + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c deleted file mode 100644 index f17ea49b28fb..000000000000 --- a/net/ipv4/inet_lro.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - * linux/net/ipv4/inet_lro.c - * - * Large Receive Offload (ipv4 / tcp) - * - * (C) Copyright IBM Corp. 2007 - * - * Authors: - * Jan-Bernd Themann <themann@de.ibm.com> - * Christoph Raisch <raisch@de.ibm.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#include <linux/module.h> -#include <linux/if_vlan.h> -#include <linux/inet_lro.h> -#include <net/checksum.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); -MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); - -#define TCP_HDR_LEN(tcph) (tcph->doff << 2) -#define IP_HDR_LEN(iph) (iph->ihl << 2) -#define TCP_PAYLOAD_LENGTH(iph, tcph) \ - (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) - -#define IPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_W_TIMESTAMP 8 - -#define LRO_MAX_PG_HLEN 64 - -#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } - -/* - * Basic tcp checks whether packet is suitable for LRO - */ - -static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, - int len, const struct net_lro_desc *lro_desc) -{ - /* check ip header: don't aggregate padded frames */ - if (ntohs(iph->tot_len) != len) - return -1; - - if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) - return -1; - - if (iph->ihl != IPH_LEN_WO_OPTIONS) - return -1; - - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || - tcph->rst || tcph->syn || tcph->fin) - return -1; - - if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) - return -1; - - if (tcph->doff != TCPH_LEN_WO_OPTIONS && - tcph->doff != TCPH_LEN_W_TIMESTAMP) - return -1; - - /* check tcp options (only timestamp allowed) */ - if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { - __be32 *topt = (__be32 *)(tcph + 1); - - if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) - return -1; - - /* timestamp should be in right order */ - topt++; - if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), - ntohl(*topt))) - return -1; - - /* timestamp reply should not be zero */ - topt++; - if (*topt == 0) - return -1; - } - - return 0; -} - -static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) -{ - struct iphdr *iph = lro_desc->iph; - struct tcphdr *tcph = lro_desc->tcph; - __be32 *p; - __wsum tcp_hdr_csum; - - tcph->ack_seq = lro_desc->tcp_ack; - tcph->window = lro_desc->tcp_window; - - if (lro_desc->tcp_saw_tstamp) { - p = (__be32 *)(tcph + 1); - *(p+2) = lro_desc->tcp_rcv_tsecr; - } - - csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); - iph->tot_len = htons(lro_desc->ip_tot_len); - - tcph->check = 0; - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); - lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); - tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - lro_desc->ip_tot_len - - IP_HDR_LEN(iph), IPPROTO_TCP, - lro_desc->data_csum); -} - -static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) -{ - __wsum tcp_csum; - __wsum tcp_hdr_csum; - __wsum tcp_ps_hdr_csum; - - tcp_csum = ~csum_unfold(tcph->check); - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum); - - tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - len + TCP_HDR_LEN(tcph), - IPPROTO_TCP, 0); - - return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), - tcp_ps_hdr_csum); -} - -static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - int nr_frags; - __be32 *ptr; - u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - nr_frags = skb_shinfo(skb)->nr_frags; - lro_desc->parent = skb; - lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); - lro_desc->iph = iph; - lro_desc->tcph = tcph; - lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; - lro_desc->tcp_ack = tcph->ack_seq; - lro_desc->tcp_window = tcph->window; - - lro_desc->pkt_aggr_cnt = 1; - lro_desc->ip_tot_len = ntohs(iph->tot_len); - - if (tcph->doff == 8) { - ptr = (__be32 *)(tcph+1); - lro_desc->tcp_saw_tstamp = 1; - lro_desc->tcp_rcv_tsval = *(ptr+1); - lro_desc->tcp_rcv_tsecr = *(ptr+2); - } - - lro_desc->mss = tcp_data_len; - lro_desc->active = 1; - - lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, - tcp_data_len); -} - -static inline void lro_clear_desc(struct net_lro_desc *lro_desc) -{ - memset(lro_desc, 0, sizeof(struct net_lro_desc)); -} - -static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, - struct tcphdr *tcph, int tcp_data_len) -{ - struct sk_buff *parent = lro_desc->parent; - __be32 *topt; - - lro_desc->pkt_aggr_cnt++; - lro_desc->ip_tot_len += tcp_data_len; - lro_desc->tcp_next_seq += tcp_data_len; - lro_desc->tcp_window = tcph->window; - lro_desc->tcp_ack = tcph->ack_seq; - - /* don't update tcp_rcv_tsval, would not work with PAWS */ - if (lro_desc->tcp_saw_tstamp) { - topt = (__be32 *) (tcph + 1); - lro_desc->tcp_rcv_tsecr = *(topt + 2); - } - - lro_desc->data_csum = csum_block_add(lro_desc->data_csum, - lro_tcp_data_csum(iph, tcph, - tcp_data_len), - parent->len); - - parent->len += tcp_data_len; - parent->data_len += tcp_data_len; - if (tcp_data_len > lro_desc->mss) - lro_desc->mss = tcp_data_len; -} - -static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *parent = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb_pull(skb, (skb->len - tcp_data_len)); - parent->truesize += skb->truesize; - - if (lro_desc->last_skb) - lro_desc->last_skb->next = skb; - else - skb_shinfo(parent)->frag_list = skb; - - lro_desc->last_skb = skb; -} - - -static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, - struct iphdr *iph, - struct tcphdr *tcph) -{ - if ((lro_desc->iph->saddr != iph->saddr) || - (lro_desc->iph->daddr != iph->daddr) || - (lro_desc->tcph->source != tcph->source) || - (lro_desc->tcph->dest != tcph->dest)) - return -1; - return 0; -} - -static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_arr, - struct iphdr *iph, - struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc = NULL; - struct net_lro_desc *tmp; - int max_desc = lro_mgr->max_desc; - int i; - - for (i = 0; i < max_desc; i++) { - tmp = &lro_arr[i]; - if (tmp->active) - if (!lro_check_tcp_conn(tmp, iph, tcph)) { - lro_desc = tmp; - goto out; - } - } - - for (i = 0; i < max_desc; i++) { - if (!lro_arr[i].active) { - lro_desc = &lro_arr[i]; - goto out; - } - } - - LRO_INC_STATS(lro_mgr, no_desc); -out: - return lro_desc; -} - -static void lro_flush(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_desc) -{ - if (lro_desc->pkt_aggr_cnt > 1) - lro_update_tcp_ip_header(lro_desc); - - skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(lro_desc->parent); - else - netif_rx(lro_desc->parent); - - LRO_INC_STATS(lro_mgr, flushed); - lro_clear_desc(lro_desc); -} - -static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, - void *priv) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - u64 flags; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_skb_header || - lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) - goto out; - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) - goto out; - - skb->ip_summed = lro_mgr->ip_summed_aggr; - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return 0; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) - goto out2; - - lro_add_packet(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return 0; - -out2: /* send aggregated SKBs to stack */ - lro_flush(lro_mgr, lro_desc); - -out: - return 1; -} - -void lro_receive_skb(struct net_lro_mgr *lro_mgr, - struct sk_buff *skb, - void *priv) -{ - if (__lro_proc_skb(lro_mgr, skb, priv)) { - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); - } -} -EXPORT_SYMBOL(lro_receive_skb); - -void lro_flush_all(struct net_lro_mgr *lro_mgr) -{ - int i; - struct net_lro_desc *lro_desc = lro_mgr->lro_arr; - - for (i = 0; i < lro_mgr->max_desc; i++) { - if (lro_desc[i].active) - lro_flush(lro_mgr, &lro_desc[i]); - } -} -EXPORT_SYMBOL(lro_flush_all); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 3f00810b7288..efbd47d1a531 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -54,8 +54,6 @@ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ - -static int sysctl_ipfrag_max_dist __read_mostly = 64; static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb @@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->daddr = arg->iph->daddr; qp->vif = arg->vif; qp->user = arg->user; - qp->peer = sysctl_ipfrag_max_dist ? + qp->peer = q->net->max_dist ? inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : NULL; } @@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = sysctl_ipfrag_max_dist; + unsigned int max = qp->q.net->max_dist; unsigned int start, end; int rc; @@ -661,6 +659,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) struct ipq *qp; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); @@ -748,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "ipfrag_max_dist", + .data = &init_net.ipv4.frags.max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, { } }; @@ -761,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "ipfrag_max_dist", - .data = &sysctl_ipfrag_max_dist, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero - }, { } }; @@ -789,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + table[3].data = &net->ipv4.frags.max_dist; } hdr = register_net_sysctl(net, "net/ipv4", table); @@ -864,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.frags.max_dist = 64; + res = inet_frags_init_net(&net->ipv4.frags); if (res) return res; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7c51c4e1661f..202437d6087b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -238,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, return -EINVAL; } } - return iptunnel_pull_header(skb, hdr_len, tpi->proto); + return iptunnel_pull_header(skb, hdr_len, tpi->proto, false); } static void ipgre_err(struct sk_buff *skb, u32 info, @@ -440,6 +440,17 @@ drop: return 0; } +static __sum16 gre_checksum(struct sk_buff *skb) +{ + __wsum csum; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csum = lco_csum(skb); + else + csum = skb_checksum(skb, 0, skb->len, 0); + return csum_fold(csum); +} + static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { @@ -467,8 +478,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); + *(__sum16 *)ptr = gre_checksum(skb); } } } @@ -493,8 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool csum) { - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); + return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static struct rtable *gre_get_rt(struct sk_buff *skb, @@ -531,9 +540,16 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = gre_get_rt(skb, dev, &fl, key); - if (IS_ERR(rt)) - goto err_free_skb; + rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) : + NULL; + if (!rt) { + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + if (!skb->mark) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl.saddr); + } tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); @@ -1054,8 +1070,9 @@ static const struct net_device_ops gre_tap_netdev_ops = { static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); - dev->netdev_ops = &gre_tap_netdev_ops; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } @@ -1240,6 +1257,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, err = ipgre_newlink(net, dev, tb, NULL); if (err < 0) goto out; + + /* openvswitch users expect packet sizes to be unrestricted, + * so set the largest MTU we can. + */ + err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); + if (err) + goto out; + return dev; out: free_netdev(dev); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b1209b63381f..e3d782746d9d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -308,15 +308,15 @@ drop: return true; } -int sysctl_ip_early_demux __read_mostly = 1; -EXPORT_SYMBOL(sysctl_ip_early_demux); - static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) { + if (net->ipv4.sysctl_ip_early_demux && + !skb_dst(skb) && + !skb->sk && + !ip_is_fragment(iph)) { const struct net_protocol *ipprot; int protocol = iph->protocol; @@ -359,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); - } else if (rt->rt_type == RTN_BROADCAST) + } else if (rt->rt_type == RTN_BROADCAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); + } else if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + /* RFC 1122 3.3.6: + * + * When a host sends a datagram to a link-layer broadcast + * address, the IP destination address MUST be a legal IP + * broadcast or IP multicast address. + * + * A host SHOULD silently discard a datagram that is received + * via a link-layer broadcast (see Section 2.4) but does not + * specify an IP multicast or broadcast destination address. + * + * This doesn't explicitly say L2 *broadcast*, but broadcast is + * in a way a form of multicast and the most common use case for + * this is 802.11 protecting against cross-station spoofing (the + * so-called "hole-196" attack) so do it for both. + */ + if (in_dev && + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) + goto drop; + } return dst_input(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 512a44778cf2..f734c42acdaf 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -79,9 +79,6 @@ #include <linux/netlink.h> #include <linux/tcp.h> -int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; -EXPORT_SYMBOL(sysctl_ip_default_ttl); - static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, @@ -239,6 +236,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * from host network stack. */ features = netif_skb_features(skb); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5f73a7c03e27..035ad645a8d9 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + + /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) @@ -571,6 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); int val = 0, err; bool needs_rtnl = setsockopt_needs_rtnl(optname); @@ -910,7 +913,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > sysctl_igmp_max_msf) { + msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) { kfree(msf); err = -ENOBUFS; break; @@ -1065,7 +1068,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sysctl_igmp_max_msf) { + gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto mc_msf_out; } @@ -1340,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, val = inet->tos; break; case IP_TTL: + { + struct net *net = sock_net(sk); val = (inet->uc_ttl == -1 ? - sysctl_ip_default_ttl : + net->ipv4.sysctl_ip_default_ttl : inet->uc_ttl); break; + } case IP_HDRINCL: val = inet->hdrincl; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c7bd72e9b544..dff8a05739a2 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst, __be32 saddr) -{ - struct dst_entry *old_dst; - - dst_clone(dst); - old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); - dst_release(old_dst); - idst->saddr = saddr; -} - -static noinline void tunnel_dst_set(struct ip_tunnel *t, - struct dst_entry *dst, __be32 saddr) -{ - __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); -} - -static void tunnel_dst_reset(struct ip_tunnel *t) -{ - tunnel_dst_set(t, NULL, 0); -} - -void ip_tunnel_dst_reset_all(struct ip_tunnel *t) -{ - int i; - - for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); -} -EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, - u32 cookie, __be32 *saddr) -{ - struct ip_tunnel_dst *idst; - struct dst_entry *dst; - - rcu_read_lock(); - idst = raw_cpu_ptr(t->dst_cache); - dst = rcu_dereference(idst->dst); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, cookie)) { - *saddr = idst->saddr; - } else { - tunnel_dst_reset(t); - dst_release(dst); - dst = NULL; - } - } - rcu_read_unlock(); - return (struct rtable *)dst; -} - static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) { @@ -381,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -729,7 +675,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; - rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; + rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : + NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -739,7 +686,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { @@ -836,7 +784,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } @@ -943,17 +891,31 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); -int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); + int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) + if (new_mtu < 68) return -EINVAL; + + if (new_mtu > max_mtu) { + if (strict) + return -EINVAL; + + new_mtu = max_mtu; + } + dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); + +int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + return __ip_tunnel_change_mtu(dev, new_mtu, true); +} EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) @@ -961,7 +923,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1155,15 +1117,15 @@ int ip_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1193,7 +1155,7 @@ void ip_tunnel_uninit(struct net_device *dev) if (itn->fb_tunnel_dev != dev) ip_tunnel_del(itn, netdev_priv(dev)); - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 859d415c0b2d..eaca2449a09a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; @@ -109,13 +110,10 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb->protocol = inner_proto; } - nf_reset(skb); - secpath_reset(skb); skb_clear_hash_if_not_l4(skb); - skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb, xnet); return 0; } EXPORT_SYMBOL_GPL(iptunnel_pull_header); @@ -148,7 +146,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, int gso_type_mask) { int err; @@ -166,20 +163,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } - /* If packet is not gso and we are resolving any partial checksum, - * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL - * on the outer header without confusing devices that implement - * NETIF_F_IP_CSUM with encapsulation. - */ - if (csum_help) - skb->encapsulation = 0; - - if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_NONE; + /* We clear encapsulation here to prevent badly-written + * drivers potentially deciding to offload an inner checksum + * if we set CHECKSUM_PARTIAL on the outer header. + * This should go away when the drivers are all fixed. + */ + skb->encapsulation = 0; + } return skb; error: diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 67f7c9de0b16..2ed9dd2b5f2f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -143,7 +143,11 @@ static char dhcp_client_identifier[253] __initdata; /* Persistent data: */ +#ifdef IPCONFIG_DYNAMIC static int ic_proto_used; /* Protocol used, if any */ +#else +#define ic_proto_used 0 +#endif static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4044da61e747..ec51d02166de 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb) if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 5fdc556514ba..7b8fbb352877 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -21,6 +21,7 @@ static struct iphdr * synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; + struct net *net = sock_net(skb->sk); skb_reset_network_header(skb); iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); @@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = sysctl_ip_default_ttl; + iph->ttl = net->ipv4.sysctl_ip_default_ttl; iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 6fb869f646bf..a04dee536b8e 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -27,8 +27,6 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, { int err; - skb_orphan(skb); - local_bh_disable(); err = ip_defrag(net, skb, user); local_bh_enable(); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c117b21b937d..76dce90c4581 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -145,10 +145,12 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -void ping_hash(struct sock *sk) +int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ + + return 0; } void ping_unhash(struct sock *sk) @@ -746,8 +748,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3abd9d7a3adf..9f665b63a927 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bc35f1842512..8d22de74080c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; -void raw_hash_sk(struct sock *sk) +int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; @@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk) sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); + + return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); @@ -547,8 +549,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(net, msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); goto out; + } if (ipc.opt) free = 1; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 85f184e429c6..02c62299d717 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ @@ -755,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, - 0, 0); + 0, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1556,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } +static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe, __rcu **fnhe_p; + u32 hval = fnhe_hashfun(daddr); + + spin_lock_bh(&fnhe_lock); + + hash = rcu_dereference_protected(nh->nh_exceptions, + lockdep_is_held(&fnhe_lock)); + hash += hval; + + fnhe_p = &hash->chain; + fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); + while (fnhe) { + if (fnhe->fnhe_daddr == daddr) { + rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( + fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); + fnhe_flush_routes(fnhe); + kfree_rcu(fnhe, rcu); + break; + } + fnhe_p = &fnhe->fnhe_next; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, + lockdep_is_held(&fnhe_lock)); + } + + spin_unlock_bh(&fnhe_lock); +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1609,11 +1640,20 @@ static int __mkroute_input(struct sk_buff *skb, fnhe = find_exception(&FIB_RES_NH(*res), daddr); if (do_cache) { - if (fnhe) + if (fnhe) { rth = rcu_dereference(fnhe->fnhe_rth_input); - else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(&FIB_RES_NH(*res), daddr); + fnhe = NULL; + } else { + goto rt_cache; + } + } + + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); +rt_cache: if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2014,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct fib_nh *nh = &FIB_RES_NH(*res); fnhe = find_exception(nh, fl4->daddr); - if (fnhe) + if (fnhe) { prth = &fnhe->fnhe_rth_output; - else { - if (unlikely(fl4->flowi4_flags & - FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { - do_cache = false; - goto add; + rth = rcu_dereference(*prth); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(nh, fl4->daddr); + fnhe = NULL; + } else { + goto rt_cache; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); } + + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); rth = rcu_dereference(*prth); + +rt_cache: if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; @@ -2569,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 643a86c49020..ba0dcffada3b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,8 +19,6 @@ #include <net/tcp.h> #include <net/route.h> -extern int sysctl_tcp_syncookies; - static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ @@ -307,7 +305,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u8 rcv_wscale; struct flowi4 fl4; - if (!sysctl_tcp_syncookies || !th->ack || th->rst) + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 46ce410703b1..1e1fe6086dd9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,7 +24,6 @@ #include <net/cipso_ipv4.h> #include <net/inet_frag.h> #include <net/ping.h> -#include <net/tcp_memcontrol.h> static int zero; static int one = 1; @@ -284,31 +283,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_default_ttl", - .data = &sysctl_ip_default_ttl, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &ip_ttl_min, - .extra2 = &ip_ttl_max, - }, - { - .procname = "tcp_syn_retries", - .data = &sysctl_tcp_syn_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_syn_retries_min, - .extra2 = &tcp_syn_retries_max - }, - { - .procname = "tcp_synack_retries", - .data = &sysctl_tcp_synack_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -323,51 +297,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_early_demux", - .data = &sysctl_ip_early_demux, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "ip_dynaddr", - .data = &sysctl_ip_dynaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_retries1", - .data = &sysctl_tcp_retries1, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra2 = &tcp_retr1_max - }, - { - .procname = "tcp_retries2", - .data = &sysctl_tcp_retries2, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_fin_timeout", - .data = &sysctl_tcp_fin_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#ifdef CONFIG_SYN_COOKIES - { - .procname = "tcp_syncookies", - .data = &sysctl_tcp_syncookies, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif - { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, .maxlen = sizeof(int), @@ -416,30 +345,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "igmp_max_memberships", - .data = &sysctl_igmp_max_memberships, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "igmp_max_msf", - .data = &sysctl_igmp_max_msf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#ifdef CONFIG_IP_MULTICAST - { - .procname = "igmp_qrv", - .data = &sysctl_igmp_qrv, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, -#endif - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -461,13 +366,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "tcp_orphan_retries", - .data = &sysctl_tcp_orphan_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_fack", .data = &sysctl_tcp_fack, .maxlen = sizeof(int), @@ -482,13 +380,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "tcp_reordering", - .data = &sysctl_tcp_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, .maxlen = sizeof(int), @@ -518,13 +409,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &one, }, { - .procname = "tcp_notsent_lowat", - .data = &sysctl_tcp_notsent_lowat, - .maxlen = sizeof(sysctl_tcp_notsent_lowat), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), @@ -846,6 +730,29 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_dynaddr", + .data = &init_net.ipv4.sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_early_demux", + .data = &init_net.ipv4.sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_default_ttl", + .data = &init_net.ipv4.sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &ip_ttl_min, + .extra2 = &ip_ttl_max, + }, + { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), .data = &init_net.ipv4.ip_local_ports.range, @@ -935,12 +842,36 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "igmp_link_local_mcast_reports", - .data = &sysctl_igmp_llm_reports, + .data = &init_net.ipv4.sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "igmp_max_memberships", + .data = &init_net.ipv4.sysctl_igmp_max_memberships, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { + .procname = "igmp_max_msf", + .data = &init_net.ipv4.sysctl_igmp_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &init_net.ipv4.sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif + { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), @@ -961,6 +892,74 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "tcp_syn_retries", + .data = &init_net.ipv4.sysctl_tcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_syn_retries_min, + .extra2 = &tcp_syn_retries_max + }, + { + .procname = "tcp_synack_retries", + .data = &init_net.ipv4.sysctl_tcp_synack_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_SYN_COOKIES + { + .procname = "tcp_syncookies", + .data = &init_net.ipv4.sysctl_tcp_syncookies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "tcp_reordering", + .data = &init_net.ipv4.sysctl_tcp_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_retries1", + .data = &init_net.ipv4.sysctl_tcp_retries1, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra2 = &tcp_retr1_max + }, + { + .procname = "tcp_retries2", + .data = &init_net.ipv4.sysctl_tcp_retries2, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_orphan_retries", + .data = &init_net.ipv4.sysctl_tcp_orphan_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fin_timeout", + .data = &init_net.ipv4.sysctl_tcp_fin_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_notsent_lowat", + .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -989,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_dynaddr = 0; + net->ipv4.sysctl_ip_early_demux = 1; + return 0; err_ports: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7bb1b091efd1..f9faadb42485 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,10 +279,9 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <asm/unaligned.h> #include <net/busy_poll.h> -int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; - int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; @@ -405,7 +404,7 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; u64_stats_init(&tp->syncp); - tp->reordering = sysctl_tcp_reordering; + tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); @@ -422,7 +421,8 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - sock_update_memcg(sk); + if (mem_cgroup_sockets_enabled) + sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } @@ -938,7 +938,7 @@ new_segment: i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { + if (!can_coalesce && i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1211,7 +1211,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == MAX_SKB_FRAGS || !sg) { + if (i == sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1464,8 +1464,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; @@ -1655,8 +1657,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -2324,6 +2328,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); int val; int err = 0; @@ -2520,7 +2525,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > sysctl_tcp_fin_timeout / HZ) + else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) tp->linger2 = 0; else tp->linger2 = val * HZ; @@ -2637,6 +2642,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + int notsent_bytes; + u64 rate64; u32 rate; memset(info, 0, sizeof(*info)); @@ -2702,18 +2709,25 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; rate = READ_ONCE(sk->sk_pacing_rate); - info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_pacing_rate); rate = READ_ONCE(sk->sk_max_pacing_rate); - info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + rate64 = rate != ~0U ? rate : ~0ULL; + put_unaligned(rate64, &info->tcpi_max_pacing_rate); do { start = u64_stats_fetch_begin_irq(&tp->syncp); - info->tcpi_bytes_acked = tp->bytes_acked; - info->tcpi_bytes_received = tp->bytes_received; + put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked); + put_unaligned(tp->bytes_received, &info->tcpi_bytes_received); } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; + + notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + info->tcpi_notsent_bytes = max(0, notsent_bytes); + + info->tcpi_min_rtt = tcp_min_rtt(tp); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2722,6 +2736,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int val, len; if (get_user(len, optlen)) @@ -2756,12 +2771,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : sysctl_tcp_fin_timeout) / HZ; + val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, @@ -2945,7 +2960,7 @@ static void __tcp_alloc_md5sig_pool(void) struct crypto_hash *hash; hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) + if (IS_ERR(hash)) return; per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 55be6ac70cff..fdb286ddba04 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -124,6 +124,41 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } + +/* If an incoming SYN or SYNACK frame contains a payload and/or FIN, + * queue this additional data / FIN. + */ +void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + skb_dst_drop(skb); + __skb_pull(skb, tcp_hdrlen(skb)); + skb_set_owner_r(skb, sk); + + TCP_SKB_CB(skb)->seq++; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; + + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ + tp->bytes_received = skb->len; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_fin(sk); +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct dst_entry *dst, @@ -132,7 +167,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; - u32 end_seq; bool own_req; req->num_retrans = 0; @@ -178,35 +212,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. - * We used to play tricky games with skb_get(). - * With lockless listener, it is a dead end. - * Do not think about it. - * - * XXX (TFO) - we honor a zero-payload TFO request for now, - * (any reason not to?) but no need to queue the skb since - * there is no data. How about SYN+FIN? - */ - end_seq = TCP_SKB_CB(skb)->end_seq; - if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - - if (likely(skb2)) { - skb_dst_drop(skb2); - __skb_pull(skb2, tcp_hdrlen(skb)); - skb_set_owner_r(skb2, child); - __skb_queue_tail(&child->sk_receive_queue, skb2); - tp->syn_data_acked = 1; - - /* u64_stats_update_begin(&tp->syncp) not needed here, - * as we certainly are not changing upper 32bit value (0) - */ - tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; - } else { - end_seq = TCP_SKB_CB(skb)->seq + 1; - } - } - tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + + tcp_fastopen_add_skb(child, skb); + + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0003d409fec5..e6e65f79ade8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -80,9 +80,7 @@ int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; -int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; int sysctl_tcp_max_reordering __read_mostly = 300; -EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -126,6 +124,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) +#define REXMIT_NONE 0 /* no loss recovery to do */ +#define REXMIT_LOST 1 /* retransmit packets marked lost */ +#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ + /* Adapt the MSS value used to make delayed ack decision to the * real world. */ @@ -1210,6 +1212,7 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; + tp->delivered += pcount; /* Out-of-order packets delivered */ fack_count += pcount; @@ -1821,8 +1824,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) static void tcp_add_reno_sack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + tp->sacked_out++; tcp_check_reno_reordering(sk, 0); + if (tp->sacked_out > prior_sacked) + tp->delivered++; /* Some out-of-order packet is delivered */ tcp_verify_left_out(tp); } @@ -1834,6 +1841,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ + tp->delivered += max_t(int, acked - tp->sacked_out, 1); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -1873,6 +1881,7 @@ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ @@ -1923,9 +1932,9 @@ void tcp_enter_loss(struct sock *sk) * suggests that the degree of reordering is over-estimated. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder && - tp->sacked_out >= sysctl_tcp_reordering) + tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + net->ipv4.sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); @@ -2109,6 +2118,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; + int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2123,7 +2133,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) */ packets_out = tp->packets_out; if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && + tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. @@ -2164,8 +2174,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt, oldcnt; - int err; + int cnt, oldcnt, lost; unsigned int mss; /* Use SACK to deduce losses of new sequences sent during recovery */ const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; @@ -2205,9 +2214,10 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = tcp_skb_mss(skb); - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, - mss, GFP_ATOMIC); - if (err < 0) + /* If needed, chop off the prefix to mark as lost. */ + lost = (packets - oldcnt) * mss; + if (lost < skb->len && + tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2366,8 +2376,6 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) tp->snd_ssthresh = tp->prior_ssthresh; tcp_ecn_withdraw_cwr(tp); } - } else { - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; @@ -2469,14 +2477,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit, int flag) +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); - int newly_acked_sacked = prior_unsacked - - (tp->packets_out - tp->sacked_out); if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; @@ -2494,7 +2500,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, } else { sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2539,7 +2546,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) +static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2553,8 +2560,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2664,7 +2669,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, + int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2686,10 +2692,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; - __tcp_push_pending_frames(sk, tcp_current_mss(sk), - TCP_NAGLE_OFF); - if (after(tp->snd_nxt, tp->high_seq)) - return; /* Step 2.b */ + /* Step 2.b. Try send new data (but deferred until cwnd + * is updated in tcp_ack()). Otherwise fall back to + * the conventional recovery. + */ + if (tcp_send_head(sk) && + after(tcp_wnd_end(tp), tp->snd_nxt)) { + *rexmit = REXMIT_NEW; + return; + } tp->frto = 0; } } @@ -2708,12 +2719,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked, int flag) +static bool tcp_try_undo_partial(struct sock *sk, const int acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2728,10 +2738,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * can undo. Otherwise we clock out new packets but do not * mark more packets lost or retransmit more. */ - if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); + if (tp->retrans_out) return true; - } if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; @@ -2750,21 +2758,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * - * Besides that it does CWND reduction, when packet loss is detected - * and changes state of machine. + * Besides that it updates the congestion state when packet loss or ECN + * is detected. But it does not reduce the cwnd, it is done by the + * congestion control later. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - const int prior_unsacked, - bool is_dupack, int flag) + bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -2811,8 +2819,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, /* Use RACK to detect loss */ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) + tcp_rack_mark_lost(sk)) { flag |= FLAG_LOST_RETRANS; + *ack_flag |= FLAG_LOST_RETRANS; + } /* E. Process state. */ switch (icsk->icsk_ca_state) { @@ -2821,7 +2831,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) + if (tcp_try_undo_partial(sk, acked)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || @@ -2833,7 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack); + tcp_process_loss(sk, flag, is_dupack, rexmit); if (icsk->icsk_ca_state != TCP_CA_Open && !(flag & FLAG_LOST_RETRANS)) return; @@ -2850,7 +2860,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag, prior_unsacked); + tcp_try_to_open(sk, flag); return; } @@ -2872,8 +2882,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Kathleen Nichols' algorithm for tracking the minimum value of @@ -2898,7 +2907,10 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + struct rtt_meas rttm = { + .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), + .ts = now, + }; u32 elapsed; /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ @@ -3095,7 +3107,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, + u32 prior_snd_una, int *acked, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3153,10 +3165,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_ORIG_SACK_ACKED; } - if (sacked & TCPCB_SACKED_ACKED) + if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; - else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } else if (tcp_is_sack(tp)) { + tp->delivered += acked_pcount; + if (!tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3265,6 +3280,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif + *acked = pkts_acked; return flag; } @@ -3298,21 +3314,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - if (tcp_in_cwnd_reduction(sk)) - return false; - /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ - if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; } +/* The "ultimate" congestion control function that aims to replace the rigid + * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). + * It's called toward the end of processing an ACK with precise rate + * information. All transmission or retransmission are delayed afterwards. + */ +static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, + int flag) +{ + if (tcp_in_cwnd_reduction(sk)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, acked_sacked, flag); + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + tcp_cong_avoid(sk, ack, acked_sacked); + } + tcp_update_pacing_rate(sk); +} + /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ @@ -3508,6 +3539,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags) icsk->icsk_ca_ops->in_ack_event(sk, flags); } +/* Congestion control has updated the cwnd already. So if we're in + * loss recovery then now we do any new sends (for FRTO) or + * retransmits (for CA_Loss or CA_recovery) that make sense. + */ +static void tcp_xmit_recovery(struct sock *sk, int rexmit) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (rexmit == REXMIT_NONE) + return; + + if (unlikely(rexmit == 2)) { + __tcp_push_pending_frames(sk, tcp_current_mss(sk), + TCP_NAGLE_OFF); + if (after(tp->snd_nxt, tp->high_seq)) + return; + tp->frto = 0; + } + tcp_xmit_retransmit_queue(sk); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3520,8 +3572,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - const int prior_unsacked = tp->packets_out - tp->sacked_out; + u32 prior_delivered = tp->delivered; int acked = 0; /* Number of packets newly acked */ + int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ sack_state.first_sackt.v64 = 0; @@ -3610,23 +3663,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); - acked -= tp->packets_out; if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3635,14 +3681,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_update_pacing_rate(sk); + tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3665,8 +3711,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_xmit_recovery(sk, rexmit); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -3997,7 +4043,7 @@ void tcp_reset(struct sock *sk) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ -static void tcp_fin(struct sock *sk) +void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5511,6 +5557,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + + tcp_fastopen_add_skb(sk, synack); + return false; } @@ -6117,9 +6166,10 @@ static bool tcp_syn_flood_action(const struct sock *sk, struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; + struct net *net = sock_net(sk); #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { + if (net->ipv4.sysctl_tcp_syncookies) { msg = "Sending cookies"; want_cookie = true; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6128,7 +6178,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!queue->synflood_warned && - sysctl_tcp_syncookies != 2 && + net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); @@ -6161,6 +6211,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; struct request_sock *req; @@ -6171,7 +6222,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * limitations, they conserve resources and peer is * evidently real one. */ - if ((sysctl_tcp_syncookies == 2 || + if ((net->ipv4.sysctl_tcp_syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); if (!want_cookie) @@ -6237,7 +6288,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } } /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && + else if (!net->ipv4.sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65947c1f4733..4c8d58dfac9b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -73,7 +73,6 @@ #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> -#include <net/tcp_memcontrol.h> #include <net/busy_poll.h> #include <linux/inet.h> @@ -312,7 +311,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ -void tcp_req_err(struct sock *sk, u32 seq) +void tcp_req_err(struct sock *sk, u32 seq, bool abort) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); @@ -324,7 +323,7 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { + } else if (abort) { /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly @@ -384,7 +383,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } seq = ntohl(th->seq); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, + type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -638,8 +642,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, - &tcp_hashinfo, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, + ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ @@ -708,7 +712,8 @@ release_sk1: outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v4_send_ack(struct net *net, + struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) @@ -723,7 +728,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb_dst(skb)->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -785,7 +789,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(sock_net(sk), skb, + tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, @@ -804,8 +809,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? - tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : + tcp_sk(sk)->snd_nxt; + + tcp_v4_send_ack(sock_net(sk), skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, @@ -858,7 +865,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } - #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of @@ -1580,7 +1586,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->sacked = 0; lookup: - sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + th->dest); if (!sk) goto no_tcp_socket; @@ -1590,28 +1597,30 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (tcp_v4_inbound_md5_hash(sk, skb)) - goto discard_and_relse; - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + reqsk_put(req); + goto discard_it; + } + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } @@ -1694,7 +1703,8 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), - &tcp_hashinfo, + &tcp_hashinfo, skb, + __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -1818,7 +1828,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); - sock_release_memcg(sk); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -2342,11 +2354,6 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .init_cgroup = tcp_init_cgroup, - .destroy_cgroup = tcp_destroy_cgroup, - .proto_cgroup = tcp_proto_cgroup, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); @@ -2389,6 +2396,16 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; + net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; + net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; + net->ipv4.sysctl_tcp_syncookies = 1; + net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; + net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; + net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; + net->ipv4.sysctl_tcp_orphan_retries = 0; + net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c deleted file mode 100644 index 2379c1b4efb2..000000000000 --- a/net/ipv4/tcp_memcontrol.c +++ /dev/null @@ -1,233 +0,0 @@ -#include <net/tcp.h> -#include <net/tcp_memcontrol.h> -#include <net/sock.h> -#include <net/ip.h> -#include <linux/nsproxy.h> -#include <linux/memcontrol.h> -#include <linux/module.h> - -int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - /* - * The root cgroup does not use page_counters, but rather, - * rely on the data already collected by the network - * subsystem - */ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - struct cg_proto *cg_proto, *parent_cg; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return 0; - - cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; - cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; - cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; - cg_proto->memory_pressure = 0; - cg_proto->memcg = memcg; - - parent_cg = tcp_prot.proto_cgroup(parent); - if (parent_cg) - counter_parent = &parent_cg->memory_allocated; - - page_counter_init(&cg_proto->memory_allocated, counter_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); - - return 0; -} -EXPORT_SYMBOL(tcp_init_cgroup); - -void tcp_destroy_cgroup(struct mem_cgroup *memcg) -{ - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return; - - percpu_counter_destroy(&cg_proto->sockets_allocated); - - if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_dec(&memcg_socket_limit_enabled); - -} -EXPORT_SYMBOL(tcp_destroy_cgroup); - -static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - struct cg_proto *cg_proto; - int i; - int ret; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return -EINVAL; - - ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); - if (ret) - return ret; - - for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, nr_pages, - sysctl_tcp_mem[i]); - - if (nr_pages == PAGE_COUNTER_MAX) - clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else { - /* - * The active bit needs to be written after the static_key - * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. - * - * We need to do this, because static_keys will span multiple - * sites, but we can't control their order. If we mark a socket - * as accounted, but the accounting functions are not patched in - * yet, we'll lose accounting. - * - * We never race with the readers in sock_update_memcg(), - * because when this value change, the code to process it is not - * patched in yet. - * - * The activated bit is used to guarantee that no two writers - * will do the update in the same memcg. Without that, we can't - * properly shutdown the static key. - */ - if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_inc(&memcg_socket_limit_enabled); - set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - } - - return 0; -} - -enum { - RES_USAGE, - RES_LIMIT, - RES_MAX_USAGE, - RES_FAILCNT, -}; - -static DEFINE_MUTEX(tcp_limit_mutex); - -static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long nr_pages; - int ret = 0; - - buf = strstrip(buf); - - switch (of_cft(of)->private) { - case RES_LIMIT: - /* see memcontrol.c */ - ret = page_counter_memparse(buf, "-1", &nr_pages); - if (ret) - break; - mutex_lock(&tcp_limit_mutex); - ret = tcp_update_limit(memcg, nr_pages); - mutex_unlock(&tcp_limit_mutex); - break; - default: - ret = -EINVAL; - break; - } - return ret ?: nbytes; -} - -static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); - u64 val; - - switch (cft->private) { - case RES_LIMIT: - if (!cg_proto) - return PAGE_COUNTER_MAX; - val = cg_proto->memory_allocated.limit; - val *= PAGE_SIZE; - break; - case RES_USAGE: - if (!cg_proto) - val = atomic_long_read(&tcp_memory_allocated); - else - val = page_counter_read(&cg_proto->memory_allocated); - val *= PAGE_SIZE; - break; - case RES_FAILCNT: - if (!cg_proto) - return 0; - val = cg_proto->memory_allocated.failcnt; - break; - case RES_MAX_USAGE: - if (!cg_proto) - return 0; - val = cg_proto->memory_allocated.watermark; - val *= PAGE_SIZE; - break; - default: - BUG(); - } - return val; -} - -static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - memcg = mem_cgroup_from_css(of_css(of)); - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return nbytes; - - switch (of_cft(of)->private) { - case RES_MAX_USAGE: - page_counter_reset_watermark(&cg_proto->memory_allocated); - break; - case RES_FAILCNT: - cg_proto->memory_allocated.failcnt = 0; - break; - } - - return nbytes; -} - -static struct cftype tcp_files[] = { - { - .name = "kmem.tcp.limit_in_bytes", - .write = tcp_cgroup_write, - .read_u64 = tcp_cgroup_read, - .private = RES_LIMIT, - }, - { - .name = "kmem.tcp.usage_in_bytes", - .read_u64 = tcp_cgroup_read, - .private = RES_USAGE, - }, - { - .name = "kmem.tcp.failcnt", - .private = RES_FAILCNT, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { - .name = "kmem.tcp.max_usage_in_bytes", - .private = RES_MAX_USAGE, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { } /* terminate */ -}; - -static int __init tcp_memcontrol_init(void) -{ - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files)); - return 0; -} -__initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c8cbc2b4b792..c26241f3057b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; @@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk) if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && - tp->reordering != sysctl_tcp_reordering) + tp->reordering != net->ipv4.sysctl_tcp_reordering) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 75632a925824..fadd8b978951 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -27,9 +27,6 @@ #include <net/inet_common.h> #include <net/xfrm.h> -int sysctl_tcp_syncookies __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_syncookies); - int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 9864a2dbadce..773083b7f1e9 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->fin = th->psh = 0; th->check = newcheck; - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); seq += mss; @@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); out: return segs; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 412a920fe0ec..7d2c7a400456 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; -EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -2813,13 +2810,16 @@ begin_fwd: */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt, status; + int amt; if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - sk_memory_allocated_add(sk, amt, &status); + sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_charge_skmem(sk->sk_memcg, amt); } /* Send a FIN. The caller locks the socket for us. @@ -3473,6 +3473,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); unsigned long probe_max; int err; @@ -3486,7 +3487,7 @@ void tcp_send_probe0(struct sock *sk) } if (err <= 0) { - if (icsk->icsk_backoff < sysctl_tcp_retries2) + if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; probe_max = TCP_RTO_MAX; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a4730a28b220..49bc474f8e35 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,11 +22,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; -int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; -int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; -int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; static void tcp_write_err(struct sock *sk) @@ -82,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* Calculate maximal number or retries on an orphaned socket. */ static int tcp_orphan_retries(struct sock *sk, bool alive) { - int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (sk->sk_err_soft && !alive) @@ -157,6 +152,7 @@ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int retry_until; bool do_reset, syn_set = false; @@ -169,10 +165,10 @@ static int tcp_write_timeout(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } - retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable * Fast Open on this path on recurring timeouts with @@ -181,7 +177,7 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_data_acked && tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == sysctl_tcp_retries1) + if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -191,7 +187,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } - retry_until = sysctl_tcp_retries2; + retry_until = net->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; @@ -305,7 +301,7 @@ static void tcp_probe_timer(struct sock *sk) (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) goto abort; - max_probes = sysctl_tcp_retries2; + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -332,7 +328,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : - sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; @@ -360,6 +356,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); if (tp->fastopen_rsk) { @@ -490,7 +487,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk); out:; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc45b538e237..836abe58a9c5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -356,8 +356,8 @@ EXPORT_SYMBOL(udp_lib_get_port); * match_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, - bool match_wildcard) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -499,6 +499,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sock *sk, *result; struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; begin: @@ -512,14 +513,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -563,6 +568,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; rcu_read_lock(); @@ -601,14 +607,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -838,32 +848,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, { struct udphdr *uh = udp_hdr(skb); - if (nocheck) + if (nocheck) { uh->check = 0; - else if (skb_is_gso(skb)) + } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v4_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp_set_csum); @@ -1038,8 +1036,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, sk->sk_family == AF_INET6); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; connected = 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 4c519c1dc161..f5abb1ae1358 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -32,42 +32,58 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features), __be16 new_protocol, bool is_ipv6) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool remcsum, need_csum, offload_csum, ufo; struct sk_buff *segs = ERR_PTR(-EINVAL); + struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; - int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; - netdev_features_t enc_features; + u16 mac_len = skb->mac_len; int udp_offset, outer_hlen; - unsigned int oldlen; - bool need_csum = !!(skb_shinfo(skb)->gso_type & - SKB_GSO_UDP_TUNNEL_CSUM); - bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); - bool offload_csum = false, dont_encap = (need_csum || remcsum); - - oldlen = (u16)~skb->len; + u32 partial; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; + /* adjust partial header checksum to negate old length */ + partial = (__force u32)uh->check + (__force u16)~uh->len; + + /* setup inner skb. */ skb->encapsulation = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); skb->encap_hdr_csum = need_csum; + + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + /* Try to offload checksum if possible */ offload_csum = !!(need_csum && - ((skb->dev->features & NETIF_F_HW_CSUM) || - (skb->dev->features & (is_ipv6 ? - NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM)))); + (skb->dev->features & + (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : + (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); + + features &= skb->dev->hw_enc_features; + + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags and + * instead set the flag based on our outer checksum offload value. + */ + if (remcsum || ufo) { + features &= ~NETIF_F_CSUM_MASK; + if (!need_csum || offload_csum) + features |= NETIF_F_HW_CSUM; + } /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & features; - segs = gso_inner_segment(skb, enc_features); + segs = gso_inner_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); @@ -78,17 +94,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, udp_offset = outer_hlen - tnl_hlen; skb = segs; do { - struct udphdr *uh; - int len; - __be32 delta; + __be16 len; - if (dont_encap) { - skb->encapsulation = 0; + if (remcsum) skb->ip_summed = CHECKSUM_NONE; - } else { - /* Only set up inner headers if we might be offloading - * inner checksum. - */ + + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } @@ -96,43 +108,28 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = mac_len; skb->protocol = protocol; - skb_push(skb, outer_hlen); + __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); - len = skb->len - udp_offset; + len = htons(skb->len - udp_offset); uh = udp_hdr(skb); - uh->len = htons(len); + uh->len = len; if (!need_csum) continue; - delta = htonl(oldlen + len); - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); - if (offload_csum) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - } else if (remcsum) { - /* Need to calculate checksum from scratch, - * inner checksums are never when doing - * remote_checksum_offload. - */ - - skb->csum = skb_checksum(skb, udp_offset, - skb->len - udp_offset, - 0); - uh->check = csum_fold(skb->csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { - uh->check = gso_make_checksum(skb, ~uh->check); + ((__force u32)len + partial)); + if (skb->encapsulation || !offload_csum) { + uh->check = gso_make_checksum(skb, ~uh->check); if (uh->check == 0) uh->check = CSUM_MANGLED_0; + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); } } while ((skb = skb->next)); out: @@ -235,6 +232,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index bb7dabe2ebbf..11e875ffd7ac 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -69,6 +69,7 @@ config INET6_ESP select CRYPTO_CBC select CRYPTO_SHA1 select CRYPTO_DES + select CRYPTO_ECHAINIV ---help--- Support for IPsec ESP. @@ -206,6 +207,7 @@ config IPV6_NDISC_NODETYPE config IPV6_TUNNEL tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL + select DST_CACHE ---help--- Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 38eeddedfc21..a2d6f6c242af 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; /* Check if a valid qdisc is available */ @@ -583,7 +585,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; @@ -3168,6 +3170,55 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +static int fixup_permanent_addr(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + if (!ifp->rt) { + struct rt6_info *rt; + + rt = addrconf_dst_alloc(idev, &ifp->addr, false); + if (unlikely(IS_ERR(rt))) + return PTR_ERR(rt); + + ifp->rt = rt; + } + + if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + idev->dev, 0, 0); + } + + addrconf_dad_start(ifp); + + return 0; +} + +static void addrconf_permanent_addr(struct net_device *dev) +{ + struct inet6_ifaddr *ifp, *tmp; + struct inet6_dev *idev; + + idev = __in6_dev_get(dev); + if (!idev) + return; + + write_lock_bh(&idev->lock); + + list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { + if ((ifp->flags & IFA_F_PERMANENT) && + fixup_permanent_addr(idev, ifp) < 0) { + write_unlock_bh(&idev->lock); + ipv6_del_addr(ifp); + write_lock_bh(&idev->lock); + + net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n", + idev->dev->name, &ifp->addr); + } + } + + write_unlock_bh(&idev->lock); +} + static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -3253,6 +3304,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } + /* restore routes for permanent addresses */ + addrconf_permanent_addr(dev); + switch (dev->type) { #if IS_ENABLED(CONFIG_IPV6_SIT) case ARPHRD_SIT: @@ -3356,7 +3410,10 @@ static int addrconf_ifdown(struct net_device *dev, int how) { struct net *net = dev_net(dev); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + struct inet6_ifaddr *ifa, *tmp; + struct list_head del_list; + int _keep_addr; + bool keep_addr; int state, i; ASSERT_RTNL(); @@ -3383,6 +3440,16 @@ static int addrconf_ifdown(struct net_device *dev, int how) } + /* aggregate the system setting and interface setting */ + _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; + if (!_keep_addr) + _keep_addr = idev->cnf.keep_addr_on_down; + + /* combine the user config with event to determine if permanent + * addresses are to be removed from address hash table + */ + keep_addr = !(how || _keep_addr <= 0); + /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { struct hlist_head *h = &inet6_addr_lst[i]; @@ -3391,9 +3458,15 @@ static int addrconf_ifdown(struct net_device *dev, int how) restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { - hlist_del_init_rcu(&ifa->addr_lst); addrconf_del_dad_work(ifa); - goto restart; + /* combined flag + permanent flag decide if + * address is retained on a down event + */ + if (!keep_addr || + !(ifa->flags & IFA_F_PERMANENT)) { + hlist_del_init_rcu(&ifa->addr_lst); + goto restart; + } } } spin_unlock_bh(&addrconf_hash_lock); @@ -3427,31 +3500,53 @@ restart: write_lock_bh(&idev->lock); } - while (!list_empty(&idev->addr_list)) { - ifa = list_first_entry(&idev->addr_list, - struct inet6_ifaddr, if_list); - addrconf_del_dad_work(ifa); + /* re-combine the user config with event to determine if permanent + * addresses are to be removed from the interface list + */ + keep_addr = (!how && _keep_addr > 0); - list_del(&ifa->if_list); + INIT_LIST_HEAD(&del_list); + list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + addrconf_del_dad_work(ifa); write_unlock_bh(&idev->lock); - spin_lock_bh(&ifa->lock); - state = ifa->state; - ifa->state = INET6_IFADDR_STATE_DEAD; + + if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) { + /* set state to skip the notifier below */ + state = INET6_IFADDR_STATE_DEAD; + ifa->state = 0; + if (!(ifa->flags & IFA_F_NODAD)) + ifa->flags |= IFA_F_TENTATIVE; + } else { + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + + list_del(&ifa->if_list); + list_add(&ifa->if_list, &del_list); + } + spin_unlock_bh(&ifa->lock); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); } - in6_ifa_put(ifa); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); + /* now clean up addresses to be removed */ + while (!list_empty(&del_list)) { + ifa = list_first_entry(&del_list, + struct inet6_ifaddr, if_list); + list_del(&ifa->if_list); + + in6_ifa_put(ifa); + } + /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); @@ -3538,6 +3633,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; + bool notify = false; addrconf_join_solict(dev, &ifp->addr); @@ -3583,7 +3679,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ - ipv6_ifa_notify(RTM_NEWADDR, ifp); + notify = true; } } @@ -3591,6 +3687,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) out: spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); + if (notify) + ipv6_ifa_notify(RTM_NEWADDR, ifp); } static void addrconf_dad_start(struct inet6_ifaddr *ifp) @@ -4711,6 +4809,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; + array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; + array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; + array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; } static inline size_t inet6_ifla6_size(void) @@ -5192,10 +5293,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (rt) ip6_del_rt(rt); } - dst_hold(&ifp->rt->dst); - - ip6_del_rt(ifp->rt); + if (ifp->rt) { + dst_hold(&ifp->rt->dst); + ip6_del_rt(ifp->rt); + ifp->rt = NULL; + } rt_genid_bump_ipv6(net); break; } @@ -5785,6 +5888,28 @@ static struct addrconf_sysctl_table .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { + .procname = "drop_unicast_in_l2_multicast", + .data = &ipv6_devconf.drop_unicast_in_l2_multicast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_unsolicited_na", + .data = &ipv6_devconf.drop_unsolicited_na, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "keep_addr_on_down", + .data = &ipv6_devconf.keep_addr_on_down, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, + { /* sentinel */ } }, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9f5137cd604e..b11c37cfd67c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -235,7 +235,11 @@ lookup_protocol: * creation time automatically shares. */ inet->inet_sport = htons(inet->inet_num); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 517c55b01ba8..428162155280 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -162,6 +162,9 @@ ipv4_connected: fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 32dc9aab7297..30613050e4ca 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -99,5 +99,6 @@ static void __exit ila_fini(void) module_init(ila_init); module_exit(ila_fini); +MODULE_ALIAS_RTNL_LWT(ILA); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 36c3f0155010..532c3ef282c5 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -26,6 +26,7 @@ #include <net/ip6_route.h> #include <net/sock.h> #include <net/inet6_connection_sock.h> +#include <net/sock_reuseport.h> int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax) @@ -48,6 +49,7 @@ int inet6_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 21ace5a2bf7c..70f2628be6fa 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -17,11 +17,13 @@ #include <linux/module.h> #include <linux/random.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -121,7 +123,9 @@ static inline int compute_score(struct sock *sk, struct net *net, } struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { @@ -129,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net, const struct hlist_nulls_node *node; struct sock *result; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -146,6 +151,15 @@ begin: if (reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -163,11 +177,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -177,6 +193,7 @@ begin: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) @@ -184,7 +201,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sock *sk; local_bh_disable(); - sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif); + sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + ntohs(dport), dif); local_bh_enable(); return sk; @@ -274,3 +292,59 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); + +int inet6_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); + local_bh_enable(); + } + + return 0; +} +EXPORT_SYMBOL_GPL(inet6_hash); + +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal); diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 9a4d7322fb22..8f920580976f 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -98,27 +98,16 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb, uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v6_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp6_set_csum); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1f9ebe3cbb4a..dc2db4f7b182 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference(*sflp)) != NULL; + (sfl = rcu_dereference_protected(*sflp, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = rcu_dereference(sfl->next); + *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f37f18b6b40c..f7c9560b75fa 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -360,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink(ign, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -633,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(tunnel); + dst = dst_cache_get(&tunnel->dst_cache); if (!dst) { dst = ip6_route_output(net, NULL, fl6); @@ -702,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(tunnel, ndst); + dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); proto = NEXTHDR_GRE; @@ -1009,7 +1009,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_key = p->o_key; t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6gre_tnl_link_config(t, set_mtu); return 0; } @@ -1219,7 +1219,7 @@ static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1257,7 +1257,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(tunnel); + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; @@ -1512,6 +1512,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->destructor = ip6gre_dev_free; dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; } static int ip6gre_newlink(struct net *src_net, struct net_device *dev, diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 9075acf081dd..c05c425c2389 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,7 +49,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); @@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) goto err; + /* If enabled, drop unicast packets that were encapsulated in link-layer + * multicast or broadcast to protected against the so-called "hole-196" + * attack in 802.11 wireless. + */ + if (!ipv6_addr_is_multicast(&hdr->daddr) && + (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) && + idev->cnf.drop_unicast_in_l2_multicast) + goto err; + /* RFC4291 2.7 * Nodes must not originate a packet to a multicast address whose scope * field contains the reserved value 0; if such a packet is received, it diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 23de98f976d5..a163102f1803 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -909,6 +909,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct rt6_info *rt; #endif int err; + int flags = 0; /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, @@ -940,10 +941,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, dst_release(*dst); *dst = NULL; } + + if (fl6->flowi6_oif) + flags |= RT6_LOOKUP_F_IFACE; } if (!*dst) - *dst = ip6_route_output(net, sk, fl6); + *dst = ip6_route_output_flags(net, sk, fl6, flags); err = (*dst)->error; if (err) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca42aaa6..3f3aabd2f07b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) return &dev->stats; } -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - write_seqlock_bh(&idst->lock); - dst_release(rcu_dereference_protected( - idst->dst, - lockdep_is_held(&idst->lock.lock))); - if (dst) { - dst_hold(dst); - idst->cookie = rt6_get_cookie((struct rt6_info *)dst); - } else { - idst->cookie = 0; - } - rcu_assign_pointer(idst->dst, dst); - write_sequnlock_bh(&idst->lock); -} - -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) -{ - struct ip6_tnl_dst *idst; - struct dst_entry *dst; - unsigned int seq; - u32 cookie; - - idst = raw_cpu_ptr(t->dst_cache); - - rcu_read_lock(); - do { - seq = read_seqbegin(&idst->lock); - dst = rcu_dereference(idst->dst); - cookie = idst->cookie; - } while (read_seqretry(&idst->lock, seq)); - - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - rcu_read_unlock(); - - if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { - ip6_tnl_per_cpu_dst_set(idst, NULL); - dst_release(dst); - dst = NULL; - } - return dst; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); - -void ip6_tnl_dst_reset(struct ip6_tnl *t) -{ - int i; - - for_each_possible_cpu(i) - ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); - -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) -{ - ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); - -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); - -void ip6_tnl_dst_destroy(struct ip6_tnl *t) -{ - if (!t->dst_cache) - return; - - ip6_tnl_dst_reset(t); - free_percpu(t->dst_cache); -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); - -int ip6_tnl_dst_init(struct ip6_tnl *t) -{ - int i; - - t->dst_cache = alloc_percpu(struct ip6_tnl_dst); - if (!t->dst_cache) - return -ENOMEM; - - for_each_possible_cpu(i) - seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); - - return 0; -} -EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); - /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dst_destroy(t); + dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); dev_put(dev); } @@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_get(t); + dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark && ndst) - ip6_tnl_dst_set(t, ndst); + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); skb->transport_header = skb->network_header; @@ -1366,7 +1275,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); return 0; } @@ -1637,7 +1546,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - ret = ip6_tnl_dst_init(t); + ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) { free_percpu(dev->tstats); dev->tstats = NULL; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0a8610b33d79..d90a11f14040 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; - ip6_tnl_dst_reset(t); + dst_cache_reset(&t->dst_cache); vti6_link_config(t); return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 84afb9a77278..c245895a3d41 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -883,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb) offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; @@ -902,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb) return; } + /* For some 802.11 wireless deployments (and possibly other networks), + * there will be a NA proxy and unsolicitd packets are attacks + * and thus should not be accepted. + */ + if (!msg->icmph.icmp6_solicited && idev && + idev->cnf.drop_unsolicited_na) + return; + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 31ba7ca19757..051b6a6bfff6 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -21,6 +21,10 @@ #include <net/ipv6.h> #include <net/netfilter/ipv6/nf_nat_masquerade.h> +#define MAX_WORK_COUNT 16 + +static atomic_t v6_worker_count; + unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, const struct net_device *out) @@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; +struct masq_dev_work { + struct work_struct work; + struct net *net; + int ifindex; +}; + +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + long index; + + w = container_of(work, struct masq_dev_work, work); + + index = w->ifindex; + nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&v6_worker_count); + module_put(THIS_MODULE); +} + +/* ipv6 inet notifier is an atomic notifier, i.e. we cannot + * schedule. + * + * Unfortunately, nf_ct_iterate_cleanup can run for a long + * time if there are lots of conntracks and the system + * handles high softirq load, so it frequently calls cond_resched + * while iterating the conntrack table. + * + * So we defer nf_ct_iterate_cleanup walk to the system workqueue. + * + * As we can have 'a lot' of inet_events (depending on amount + * of ipv6 addresses being deleted), we also need to add an upper + * limit to the number of queued work items. + */ static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; - struct netdev_notifier_info info; + const struct net_device *dev; + struct masq_dev_work *w; + struct net *net; + + if (event != NETDEV_DOWN || + atomic_read(&v6_worker_count) >= MAX_WORK_COUNT) + return NOTIFY_DONE; + + dev = ifa->idev->dev; + net = maybe_get_net(dev_net(dev)); + if (!net) + return NOTIFY_DONE; - netdev_notifier_info_init(&info, ifa->idev->dev); - return masq_device_event(this, event, &info); + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (w) { + atomic_inc(&v6_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = dev->ifindex; + w->net = net; + schedule_work(&w->work); + + return NOTIFY_DONE; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); + return NOTIFY_DONE; } static struct notifier_block masq_inet_notifier = { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 18f3498a6c80..e2ea31175ef9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -496,10 +496,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); + skb_postpush_rcsum(head, skb_network_header(head), + skb_network_header_len(head)); rcu_read_lock(); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3c8834bc822d..ed446639219c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1183,11 +1183,10 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, - struct flowi6 *fl6) +struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, + struct flowi6 *fl6, int flags) { struct dst_entry *dst; - int flags = 0; bool any_src; dst = l3mdev_rt6_dst_by_oif(net, fl6); @@ -1208,7 +1207,7 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } -EXPORT_SYMBOL(ip6_route_output); +EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e794ef66a401..f45b8ffc2840 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -201,14 +201,14 @@ static int ipip6_tunnel_create(struct net_device *dev) if ((__force u16)t->parms.i_flags & SIT_ISATAP) dev->priv_flags |= IFF_ISATAP; + dev->rtnl_link_ops = &sit_link_ops; + err = register_netdevice(dev); if (err < 0) goto out; ipip6_tunnel_clone_6rd(dev, sitn); - dev->rtnl_link_ops = &sit_link_ops; - dev_hold(dev); ipip6_tunnel_link(sitn, t); @@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev) ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); dev_put(dev); } @@ -740,7 +740,7 @@ static int ipip_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -911,7 +911,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT); if (IS_ERR(skb)) { ip_rt_put(rt); goto out; @@ -1000,7 +1000,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; @@ -1093,7 +1093,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) t->parms.link = p->link; ipip6_tunnel_bind_dev(t->dev); } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } @@ -1124,7 +1124,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } @@ -1278,7 +1278,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); break; @@ -1339,7 +1339,7 @@ static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1372,6 +1372,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + int err; tunnel->dev = dev; tunnel->net = dev_net(dev); @@ -1382,10 +1383,10 @@ static int ipip6_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } return 0; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 2906ef20795e..0e393ff7f5d0 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -148,7 +148,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst; __u8 rcv_wscale; - if (!sysctl_tcp_syncookies || !th->ack || th->rst) + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index db9f1c318afc..33f2820181f9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -61,7 +61,6 @@ #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> -#include <net/tcp_memcontrol.h> #include <net/busy_poll.h> #include <linux/proc_fs.h> @@ -328,6 +327,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; + bool fatal; int err; sk = __inet6_lookup_established(net, &tcp_hashinfo, @@ -346,8 +346,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } seq = ntohl(th->seq); + fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, fatal); bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -401,7 +402,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - icmpv6_err_convert(type, code, &err); /* Might be for an request_sock */ switch (sk->sk_state) { @@ -867,7 +867,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), - &tcp_hashinfo, &ipv6h->saddr, + &tcp_hashinfo, NULL, 0, + &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) @@ -1376,8 +1377,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, - inet6_iif(skb)); + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + th->source, th->dest, inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1387,7 +1388,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; tcp_v6_fill_cb(skb, hdr, th); @@ -1395,24 +1396,24 @@ process: reqsk_put(req); goto discard_it; } - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); tcp_v6_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } @@ -1501,6 +1502,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif(skb)); @@ -1866,7 +1868,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, @@ -1889,9 +1891,6 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .proto_cgroup = tcp_proto_cgroup, -#endif .clear_sk = tcp_v6_clear_sk, .diag_destroy = tcp_abort, }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5d2c2afffe7b..0711f8fe4d44 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> +#include <net/addrconf.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> @@ -77,49 +78,6 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 - * only, and any IPv4 addresses if not IPv6 only - * match_wildcard == false: addresses must be exactly the same, i.e. - * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, - * and 0.0.0.0 equals to 0.0.0.0 only - */ -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) -{ - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { - if (!sk2_ipv6only) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) - return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) - return match_wildcard; - } - return 0; - } - - if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && match_wildcard && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) - return 1; - - return 0; -} - static u32 udp6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) @@ -257,6 +215,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct sock *sk, *result; struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; begin: @@ -270,14 +229,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -321,6 +284,7 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; + bool select_ok = true; u32 hash = 0; rcu_read_lock(); @@ -358,14 +322,18 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - sk2 = reuseport_select_sock(sk, hash, skb, + if (select_ok) { + struct sock *sk2; + + sk2 = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - goto found; + if (sk2) { + result = sk2; + select_ok = false; + goto found; + } } matches = 1; } @@ -580,6 +548,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); struct sock *sk; + int harderr; int err; struct net *net = dev_net(skb->dev); @@ -591,26 +560,27 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } + harderr = icmpv6_err_convert(type, code, &err); + np = inet6_sk(sk); + if (type == ICMPV6_PKT_TOOBIG) { if (!ip6_sk_accept_pmtu(sk)) goto out; ip6_sk_update_pmtu(skb, sk, info); + if (np->pmtudisc != IPV6_PMTUDISC_DONT) + harderr = 1; } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); goto out; } - np = inet6_sk(sk); - - if (!icmpv6_err_convert(type, code, &err) && !np->recverr) - goto out; - - if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) - goto out; - - if (np->recverr) + if (!np->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + } sk->sk_err = err; sk->sk_error_report(sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 7441e1e63893..2b0fbe6929e8 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -81,12 +81,18 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, csum = skb_checksum(skb, 0, skb->len, 0); uh->check = udp_v6_check(skb->len, &ipv6h->saddr, &ipv6h->daddr, csum); - if (uh->check == 0) uh->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_NONE; + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + /* Check if there is enough headroom to insert fragment header. */ tnl_hlen = skb_tnl_header_len(skb); if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index f7fbdbabe50e..372855eeaf42 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -23,7 +23,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) - IP6_ECN_set_ce(inner_iph); + IP6_ECN_set_ce(skb, inner_iph); } /* Add encapsulation header. diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c index 3c4caa60c926..5728e76ca6d5 100644 --- a/net/irda/ircomm/ircomm_param.c +++ b/net/irda/ircomm/ircomm_param.c @@ -134,11 +134,10 @@ int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush) return -1; } skb_put(skb, count); + pr_debug("%s(), skb->len=%d\n", __func__, skb->len); spin_unlock_irqrestore(&self->spinlock, flags); - pr_debug("%s(), skb->len=%d\n", __func__ , skb->len); - if (flush) { /* ircomm_tty_do_softint will take care of the rest */ schedule_work(&self->tqueue); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ef50a94d3eb7..fc3598a922b0 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -708,6 +708,9 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, if (!addr || addr->sa_family != AF_IUCV) return -EINVAL; + if (addr_len < sizeof(struct sockaddr_iucv)) + return -EINVAL; + lock_sock(sk); if (sk->sk_state != IUCV_OPEN) { err = -EBADFD; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index a2c8747d2936..6b54ff3ff4cb 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -25,6 +25,7 @@ #include <net/udp.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> @@ -718,7 +719,7 @@ static struct proto l2tp_ip6_prot = { .sendmsg = l2tp_ip6_sendmsg, .recvmsg = l2tp_ip6_recvmsg, .backlog_rcv = l2tp_ip6_backlog_recv, - .hash = inet_hash, + .hash = inet6_hash, .unhash = inet_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), #ifdef CONFIG_COMPAT diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f93c5be612a7..2caaa84ce92d 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family, ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); @@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family, ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 8e5ead366e7f..e925037fa0df 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -17,7 +17,7 @@ * @dev: targeted interface */ -int l3mdev_master_ifindex_rcu(struct net_device *dev) +int l3mdev_master_ifindex_rcu(const struct net_device *dev) { int ifindex = 0; @@ -28,8 +28,15 @@ int l3mdev_master_ifindex_rcu(struct net_device *dev) ifindex = dev->ifindex; } else if (netif_is_l3_slave(dev)) { struct net_device *master; + struct net_device *_dev = (struct net_device *)dev; - master = netdev_master_upper_dev_get_rcu(dev); + /* netdev_master_upper_dev_get_rcu calls + * list_first_or_null_rcu to walk the upper dev list. + * list_first_or_null_rcu does not handle a const arg. We aren't + * making changes, just want the master device from that list so + * typecast to remove the const + */ + master = netdev_master_upper_dev_get_rcu(_dev); if (master) ifindex = master->ifindex; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8dab4e569571..b3c52e3f689a 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; -static int llc_ui_wait_for_conn(struct sock *sk, long timeout); +static long llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); @@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) return rc; } -static int llc_ui_wait_for_conn(struct sock *sk, long timeout) +static long llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT(wait); diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e433d0c97e86..4ab5c522ceee 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -91,7 +91,7 @@ static const struct file_operations reset_ops = { }; #endif -static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { +static const char *hw_flag_names[] = { #define FLAG(F) [IEEE80211_HW_##F] = #F FLAG(HAS_RATE_CONTROL), FLAG(RX_INCLUDES_FCS), @@ -127,9 +127,6 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), FLAG(SUPPORTS_REORDERING_BUFFER), - - /* keep last for the build bug below */ - (void *)0x1 #undef FLAG }; @@ -149,7 +146,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, /* fail compilation if somebody adds or removes * a flag without updating the name array above */ - BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); + BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS); for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 9b983788ee51..fc3238376b39 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1739,7 +1739,6 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) if (sdata->vif.type != NL80211_IFTYPE_ADHOC) continue; sdata->u.ibss.last_scan_completed = jiffies; - ieee80211_queue_work(&local->hw, &sdata->work); } mutex_unlock(&local->iflist_mtx); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6bcf0faa4a89..8190bf27ebff 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -248,6 +248,7 @@ static void ieee80211_restart_work(struct work_struct *work) /* wait for scan work complete */ flush_workqueue(local->workqueue); + flush_work(&local->sched_scan_stopped_work); WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), "%s called with hardware scan in progress\n", __func__); @@ -256,6 +257,11 @@ static void ieee80211_restart_work(struct work_struct *work) list_for_each_entry(sdata, &local->interfaces, list) flush_delayed_work(&sdata->dec_tailroom_needed_wk); ieee80211_scan_cancel(local); + + /* make sure any new ROC will consider local->in_reconfig */ + flush_delayed_work(&local->roc_work); + flush_work(&local->hw_roc_done); + ieee80211_reconfig(local); rtnl_unlock(); } diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 9a8e7b57c86e..d32cefcb63b0 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1369,17 +1369,6 @@ out: sdata_unlock(sdata); } -void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) -{ - struct ieee80211_sub_if_data *sdata; - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) - if (ieee80211_vif_is_mesh(&sdata->vif) && - ieee80211_sdata_running(sdata)) - ieee80211_queue_work(&local->hw, &sdata->work); - rcu_read_unlock(); -} void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) { diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index d941f3a73a4f..87c017a3b1ce 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -359,14 +359,10 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP; } -void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local); - void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata); void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata); void ieee80211s_stop(void); #else -static inline void -ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {} static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) { return false; } static inline void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f41625bcd879..281b8d6e5109 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3980,8 +3980,6 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.monitor_work); - /* and do all the other regular work too */ - ieee80211_queue_work(&sdata->local->hw, &sdata->work); } } diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 8b2f4eaac2ba..55a9c5b94ce1 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -252,14 +252,11 @@ static bool ieee80211_recalc_sw_work(struct ieee80211_local *local, static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, unsigned long start_time) { - struct ieee80211_local *local = roc->sdata->local; - if (WARN_ON(roc->notified)) return; roc->start_time = start_time; roc->started = true; - roc->hw_begun = true; if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { @@ -274,9 +271,6 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, } roc->notified = true; - - if (!local->ops->remain_on_channel) - ieee80211_recalc_sw_work(local, start_time); } static void ieee80211_hw_roc_start(struct work_struct *work) @@ -291,6 +285,7 @@ static void ieee80211_hw_roc_start(struct work_struct *work) if (!roc->started) break; + roc->hw_begun = true; ieee80211_handle_roc_started(roc, local->hw_roc_start_time); } @@ -413,6 +408,10 @@ void ieee80211_start_next_roc(struct ieee80211_local *local) return; } + /* defer roc if driver is not started (i.e. during reconfig) */ + if (local->in_reconfig) + return; + roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); @@ -534,8 +533,10 @@ ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local, * begin, otherwise they'll both be marked properly by the work * struct that runs once the driver notifies us of the beginning */ - if (cur_roc->hw_begun) + if (cur_roc->hw_begun) { + new_roc->hw_begun = true; ieee80211_handle_roc_started(new_roc, now); + } return true; } @@ -658,6 +659,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, queued = true; roc->on_channel = tmp->on_channel; ieee80211_handle_roc_started(roc, now); + ieee80211_recalc_sw_work(local, now); break; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index a413e52f7691..ae980ce8daff 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -314,6 +314,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; + struct ieee80211_sub_if_data *sdata; lockdep_assert_held(&local->mtx); @@ -373,7 +374,16 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); - ieee80211_mesh_notify_scan_completed(local); + + /* Requeue all the work that might have been ignored while + * the scan was in progress; if there was none this will + * just be a no-op for the particular interface. + */ + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (ieee80211_sdata_running(sdata)) + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + } + if (was_scanning) ieee80211_start_next_roc(local); } @@ -1213,6 +1223,14 @@ void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) trace_api_sched_scan_stopped(local); + /* + * this shouldn't really happen, so for simplicity + * simply ignore it, and let mac80211 reconfigure + * the sched scan later on. + */ + if (local->in_reconfig) + return; + schedule_work(&local->sched_scan_stopped_work); } EXPORT_SYMBOL(ieee80211_sched_scan_stopped); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 87b7e7a7df6c..d20bab5c146c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1461,7 +1461,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); - if (reason == IEEE80211_FRAME_RELEASE_PSPOLL) + if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL) driver_release_tids = BIT(find_highest_prio_tid(driver_release_tids)); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 5bad05e9af90..6101deb805a8 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -51,6 +51,11 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct ieee80211_hdr *hdr = (void *)skb->data; int ac; + if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { + ieee80211_free_txskb(&local->hw, skb); + return; + } + /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 89f71799df84..7390de4946a9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2046,16 +2046,26 @@ int ieee80211_reconfig(struct ieee80211_local *local) */ if (sched_scan_req->n_scan_plans > 1 || __ieee80211_request_sched_scan_start(sched_scan_sdata, - sched_scan_req)) + sched_scan_req)) { + RCU_INIT_POINTER(local->sched_scan_sdata, NULL); + RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_stopped = true; + } mutex_unlock(&local->mtx); if (sched_scan_stopped) cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); wake_up: - local->in_reconfig = false; - barrier(); + if (local->in_reconfig) { + local->in_reconfig = false; + barrier(); + + /* Restart deferred ROCs */ + mutex_lock(&local->mtx); + ieee80211_start_next_roc(local); + mutex_unlock(&local->mtx); + } if (local->monitors == local->open_count && local->monitors > 0) ieee80211_add_virtual_monitor(local); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index fb31aa87de81..644a8da6d4bd 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -227,5 +227,6 @@ static void __exit mpls_iptunnel_exit(void) } module_exit(mpls_iptunnel_exit); +MODULE_ALIAS_RTNL_LWT(MPLS); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8c067e6663a1..95e757c377f9 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -891,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 - select NF_DUP_IPV6 if IP6_NF_IPTABLES != n + select NF_DUP_IPV6 if IPV6 ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 43d8c9896fa3..f0f688db6213 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -164,8 +164,6 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (e.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; @@ -377,8 +375,6 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (e.cidr == 0) - return -EINVAL; if (adt == IPSET_TEST) e.cidr = HOST_MASK; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3264cb49b333..a3f5cd9b3c4c 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1019,8 +1019,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)); if (IS_ERR(skb)) goto tx_error; @@ -1112,8 +1111,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads( - skb, false, __tun_gso_type_mask(AF_INET6, cp->af)); + skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)); if (IS_ERR(skb)) goto tx_error; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3cb3cb831591..f60b4fdeeb8c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -66,6 +66,21 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); +static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly bool nf_conntrack_locks_all; + +void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) +{ + spin_lock(lock); + while (unlikely(nf_conntrack_locks_all)) { + spin_unlock(lock); + spin_lock(&nf_conntrack_locks_all_lock); + spin_unlock(&nf_conntrack_locks_all_lock); + spin_lock(lock); + } +} +EXPORT_SYMBOL_GPL(nf_conntrack_lock); + static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) { h1 %= CONNTRACK_LOCKS; @@ -82,12 +97,12 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; if (h1 <= h2) { - spin_lock(&nf_conntrack_locks[h1]); + nf_conntrack_lock(&nf_conntrack_locks[h1]); if (h1 != h2) spin_lock_nested(&nf_conntrack_locks[h2], SINGLE_DEPTH_NESTING); } else { - spin_lock(&nf_conntrack_locks[h2]); + nf_conntrack_lock(&nf_conntrack_locks[h2]); spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } @@ -102,16 +117,19 @@ static void nf_conntrack_all_lock(void) { int i; - for (i = 0; i < CONNTRACK_LOCKS; i++) - spin_lock_nested(&nf_conntrack_locks[i], i); + spin_lock(&nf_conntrack_locks_all_lock); + nf_conntrack_locks_all = true; + + for (i = 0; i < CONNTRACK_LOCKS; i++) { + spin_lock(&nf_conntrack_locks[i]); + spin_unlock(&nf_conntrack_locks[i]); + } } static void nf_conntrack_all_unlock(void) { - int i; - - for (i = 0; i < CONNTRACK_LOCKS; i++) - spin_unlock(&nf_conntrack_locks[i]); + nf_conntrack_locks_all = false; + spin_unlock(&nf_conntrack_locks_all_lock); } unsigned int nf_conntrack_htable_size __read_mostly; @@ -757,7 +775,7 @@ restart: hash = hash_bucket(_hash, net); for (; i < net->ct.htable_size; i++) { lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; - spin_lock(lockp); + nf_conntrack_lock(lockp); if (read_seqcount_retry(&net->ct.generation, sequence)) { spin_unlock(lockp); goto restart; @@ -1382,7 +1400,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), for (; *bucket < net->ct.htable_size; (*bucket)++) { lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); - spin_lock(lockp); + nf_conntrack_lock(lockp); if (*bucket < net->ct.htable_size) { hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -1394,6 +1412,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), } spin_unlock(lockp); local_bh_enable(); + cond_resched(); } for_each_possible_cpu(cpu) { @@ -1406,6 +1425,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), set_bit(IPS_DYING_BIT, &ct->status); } spin_unlock_bh(&pcpu->lock); + cond_resched(); } return NULL; found: @@ -1422,6 +1442,8 @@ void nf_ct_iterate_cleanup(struct net *net, struct nf_conn *ct; unsigned int bucket = 0; + might_sleep(); + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) @@ -1430,6 +1452,7 @@ void nf_ct_iterate_cleanup(struct net *net, /* ... else the timer will get him soon. */ nf_ct_put(ct); + cond_resched(); } } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index bd9d31537905..3b40ec575cd5 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -425,7 +425,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, } local_bh_disable(); for (i = 0; i < net->ct.htable_size; i++) { - spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); if (i < net->ct.htable_size) { hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) unhelp(h, me); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dbb1bb3edb45..355e8552fd5b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -840,7 +840,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { restart: lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; - spin_lock(lockp); + nf_conntrack_lock(lockp); if (cb->args[0] >= net->ct.htable_size) { spin_unlock(lockp); goto out; diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index b6605e000801..5eefe4a355c6 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -224,12 +224,12 @@ static int __init nf_tables_netdev_init(void) nft_register_chain_type(&nft_filter_chain_netdev); ret = register_pernet_subsys(&nf_tables_netdev_net_ops); - if (ret < 0) + if (ret < 0) { nft_unregister_chain_type(&nft_filter_chain_netdev); - + return ret; + } register_netdevice_notifier(&nf_tables_netdev_notifier); - - return ret; + return 0; } static void __exit nf_tables_netdev_exit(void) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a7ba23353dab..2278d9ab723b 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -127,13 +127,6 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) -{ - return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); -} -EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); - int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { @@ -311,14 +304,14 @@ replay: #endif { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } @@ -328,10 +321,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ @@ -406,7 +401,7 @@ ack: * pointing to the batch header. */ nfnl_err_reset(&err_list); - netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); status |= NFNL_BATCH_FAILURE; goto done; } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 5d010f27ac01..2671b9deb103 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -307,7 +307,7 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) local_bh_disable(); for (i = 0; i < net->ct.htable_size; i++) { - spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); if (i < net->ct.htable_size) { hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) untimeout(h, timeout); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 8ca932057c13..11f81c8385fc 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -330,14 +330,13 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size, * message. WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC); + skb = alloc_skb(n, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = nfnetlink_alloc_skb(net, pkt_size, - peer_portid, GFP_ATOMIC); + skb = alloc_skb(pkt_size, GFP_ATOMIC); } } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1d3936587ace..75429997ed41 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0, rem_len = 0; + size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -361,7 +361,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; - rem_len = data_len - hlen; break; } @@ -386,8 +385,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, - GFP_ATOMIC); + skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); return NULL; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 383c17138399..b78c28ba465f 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -46,16 +46,14 @@ static void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { - src64 = get_unaligned_be64(&src[i]); - src64 = be64_to_cpu((__force __be64)src64); + src64 = get_unaligned((u64 *)&src[i]); put_unaligned_be64(src64, &dst[i]); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { src64 = get_unaligned_be64(&src[i]); - src64 = (__force u64)cpu_to_be64(src64); - put_unaligned_be64(src64, &dst[i]); + put_unaligned(src64, (u64 *)&dst[i]); } break; } diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index c7808fc19719..c9743f78f219 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -100,7 +100,7 @@ static int nft_counter_init(const struct nft_ctx *ctx, cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); @@ -138,7 +138,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, GFP_ATOMIC); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a0eb2161e3ef..d4a4619fcebc 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -127,6 +127,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, NF_CT_LABELS_MAX_SIZE - size); return; } +#endif case NFT_CT_BYTES: /* fallthrough */ case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); @@ -138,7 +139,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, &count, sizeof(count)); return; } -#endif default: break; } diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index b7c43def0dc6..e118397254af 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -228,7 +228,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; - __be16 frag_off; + __be16 frag_off, oldlen, newlen; int tcphoff; int ret; @@ -244,7 +244,12 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; if (ret > 0) { ipv6h = ipv6_hdr(skb); - ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret); + oldlen = ipv6h->payload_len; + newlen = htons(ntohs(oldlen) + ret); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(csum_sub(skb->csum, oldlen), + newlen); + ipv6h->payload_len = newlen; } return XT_CONTINUE; } diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 3eff7b67cdf2..6e57a3966dc5 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -131,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) { .name = "TEE", .revision = 1, diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 3ab591e73ec0..7f4414d26a66 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) * belonging to established connections going through that one. */ static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet_lookup_listener(net, &tcp_hashinfo, skb, + ip_hdrlen(skb) + + __tcp_hdrlen(tcph), saddr, sport, daddr, dport, in->ifindex); @@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, #ifdef XT_TPROXY_HAVE_IPV6 static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; + struct tcphdr *tcph; switch (protocol) { case IPPROTO_TCP: switch (lookup_type) { case NFT_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, + tcph = hp; + sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, + thoff + __tcp_hdrlen(tcph), saddr, sport, daddr, ntohs(dport), in->ifindex); @@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); @@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); @@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); @@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(par->net, tproto, + sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), hp->source, @@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); @@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(par->net, tproto, - &iph->saddr, laddr, + sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, + tproto, &iph->saddr, laddr, hp->source, lport, par->in, NFT_LOOKUP_LISTENER); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 2ec08f04b816..49d14ecad444 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb, * box. */ static struct sock * -xt_socket_get_sock_v4(struct net *net, const u8 protocol, +xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, + const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, + return __inet_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -148,6 +149,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; __be32 uninitialized_var(daddr), uninitialized_var(saddr); __be16 uninitialized_var(dport), uninitialized_var(sport); u8 uninitialized_var(protocol); @@ -169,6 +172,10 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, sport = hp->source; daddr = iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = iph->protocol == IPPROTO_TCP ? + ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : + ip_hdrlen(skb) + sizeof(*hp); } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, @@ -198,8 +205,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net, } #endif - return xt_socket_get_sock_v4(net, protocol, saddr, daddr, - sport, dport, indev); + return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr, + daddr, sport, dport, indev); } static bool @@ -318,14 +325,15 @@ extract_icmp6_fields(const struct sk_buff *skb, } static struct sock * -xt_socket_get_sock_v6(struct net *net, const u8 protocol, +xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, + const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, &tcp_hashinfo, + return inet6_lookup(net, &tcp_hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: @@ -343,6 +351,8 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, __be16 uninitialized_var(dport), uninitialized_var(sport); const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb); + struct sk_buff *data_skb = NULL; + int doff = 0; int thoff = 0, tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); @@ -362,6 +372,10 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, sport = hp->source; daddr = &iph->daddr; dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = tproto == IPPROTO_TCP ? + thoff + __tcp_hdrlen((struct tcphdr *)hp) : + thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { struct ipv6hdr ipv6_var; @@ -373,7 +387,7 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net, return NULL; } - return xt_socket_get_sock_v6(net, tproto, saddr, daddr, + return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr, sport, dport, indev); } diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig index 2c5e95e9bfbd..5d6e8c05b3d4 100644 --- a/net/netlink/Kconfig +++ b/net/netlink/Kconfig @@ -2,15 +2,6 @@ # Netlink Sockets # -config NETLINK_MMAP - bool "NETLINK: mmaped IO" - ---help--- - This option enables support for memory mapped netlink IO. This - reduces overhead by avoiding copying data between kernel- and - userspace. - - If unsure, say N. - config NETLINK_DIAG tristate "NETLINK: socket monitoring interface" default n diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 81dc1bb6e016..c8416792cce0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, dev_hold(dev); - if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); @@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk) wake_up_interruptible(&nlk->wait); } -#ifdef CONFIG_NETLINK_MMAP -static bool netlink_rx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->rx_ring.pg_vec != NULL; -} - -static bool netlink_tx_is_mmaped(struct sock *sk) -{ - return nlk_sk(sk)->tx_ring.pg_vec != NULL; -} - -static __pure struct page *pgvec_to_page(const void *addr) -{ - if (is_vmalloc_addr(addr)) - return vmalloc_to_page(addr); - else - return virt_to_page(addr); -} - -static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; i++) { - if (pg_vec[i] != NULL) { - if (is_vmalloc_addr(pg_vec[i])) - vfree(pg_vec[i]); - else - free_pages((unsigned long)pg_vec[i], order); - } - } - kfree(pg_vec); -} - -static void *alloc_one_pg_vec_page(unsigned long order) -{ - void *buffer; - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | - __GFP_NOWARN | __GFP_NORETRY; - - buffer = (void *)__get_free_pages(gfp_flags, order); - if (buffer != NULL) - return buffer; - - buffer = vzalloc((1 << order) * PAGE_SIZE); - if (buffer != NULL) - return buffer; - - gfp_flags &= ~__GFP_NORETRY; - return (void *)__get_free_pages(gfp_flags, order); -} - -static void **alloc_pg_vec(struct netlink_sock *nlk, - struct nl_mmap_req *req, unsigned int order) -{ - unsigned int block_nr = req->nm_block_nr; - unsigned int i; - void **pg_vec; - - pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); - if (pg_vec == NULL) - return NULL; - - for (i = 0; i < block_nr; i++) { - pg_vec[i] = alloc_one_pg_vec_page(order); - if (pg_vec[i] == NULL) - goto err1; - } - - return pg_vec; -err1: - free_pg_vec(pg_vec, order, block_nr); - return NULL; -} - - -static void -__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, - unsigned int order) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct sk_buff_head *queue; - struct netlink_ring *ring; - - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - spin_lock_bh(&queue->lock); - - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); -} - -static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool tx_ring) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - void **pg_vec = NULL; - unsigned int order = 0; - - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - - if (req->nm_block_nr) { - if (ring->pg_vec != NULL) - return -EBUSY; - - if ((int)req->nm_block_size <= 0) - return -EINVAL; - if (!PAGE_ALIGNED(req->nm_block_size)) - return -EINVAL; - if (req->nm_frame_size < NL_MMAP_HDRLEN) - return -EINVAL; - if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) - return -EINVAL; - - ring->frames_per_block = req->nm_block_size / - req->nm_frame_size; - if (ring->frames_per_block == 0) - return -EINVAL; - if (ring->frames_per_block * req->nm_block_nr != - req->nm_frame_nr) - return -EINVAL; - - order = get_order(req->nm_block_size); - pg_vec = alloc_pg_vec(nlk, req, order); - if (pg_vec == NULL) - return -ENOMEM; - } else { - if (req->nm_frame_nr) - return -EINVAL; - } - - mutex_lock(&nlk->pg_vec_lock); - if (atomic_read(&nlk->mapped) == 0) { - __netlink_set_ring(sk, req, tx_ring, pg_vec, order); - mutex_unlock(&nlk->pg_vec_lock); - return 0; - } - - mutex_unlock(&nlk->pg_vec_lock); - - if (pg_vec) - free_pg_vec(pg_vec, order, req->nm_block_nr); - - return -EBUSY; -} - -static void netlink_mm_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_inc(&nlk_sk(sk)->mapped); -} - -static void netlink_mm_close(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct socket *sock = file->private_data; - struct sock *sk = sock->sk; - - if (sk) - atomic_dec(&nlk_sk(sk)->mapped); -} - -static const struct vm_operations_struct netlink_mmap_ops = { - .open = netlink_mm_open, - .close = netlink_mm_close, -}; - -static int netlink_mmap(struct file *file, struct socket *sock, - struct vm_area_struct *vma) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - unsigned long start, size, expected; - unsigned int i; - int err = -EINVAL; - - if (vma->vm_pgoff) - return -EINVAL; - - mutex_lock(&nlk->pg_vec_lock); - - expected = 0; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; - } - - if (expected == 0) - goto out; - - size = vma->vm_end - vma->vm_start; - if (size != expected) - goto out; - - start = vma->vm_start; - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { - if (ring->pg_vec == NULL) - continue; - - for (i = 0; i < ring->pg_vec_len; i++) { - struct page *page; - void *kaddr = ring->pg_vec[i]; - unsigned int pg_num; - - for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { - page = pgvec_to_page(kaddr); - err = vm_insert_page(vma, start, page); - if (err < 0) - goto out; - start += PAGE_SIZE; - kaddr += PAGE_SIZE; - } - } - } - - atomic_inc(&nlk->mapped); - vma->vm_ops = &netlink_mmap_ops; - err = 0; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len) -{ -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 - struct page *p_start, *p_end; - - /* First page is flushed through netlink_{get,set}_status */ - p_start = pgvec_to_page(hdr + PAGE_SIZE); - p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1); - while (p_start <= p_end) { - flush_dcache_page(p_start); - p_start++; - } -#endif -} - -static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) -{ - smp_rmb(); - flush_dcache_page(pgvec_to_page(hdr)); - return hdr->nm_status; -} - -static void netlink_set_status(struct nl_mmap_hdr *hdr, - enum nl_mmap_status status) -{ - smp_mb(); - hdr->nm_status = status; - flush_dcache_page(pgvec_to_page(hdr)); -} - -static struct nl_mmap_hdr * -__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) -{ - unsigned int pg_vec_pos, frame_off; - - pg_vec_pos = pos / ring->frames_per_block; - frame_off = pos % ring->frames_per_block; - - return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); -} - -static struct nl_mmap_hdr * -netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, - enum nl_mmap_status status) -{ - struct nl_mmap_hdr *hdr; - - hdr = __netlink_lookup_frame(ring, pos); - if (netlink_get_status(hdr) != status) - return NULL; - - return hdr; -} - -static struct nl_mmap_hdr * -netlink_current_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - return netlink_lookup_frame(ring, ring->head, status); -} - -static void netlink_increment_head(struct netlink_ring *ring) -{ - ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; -} - -static void netlink_forward_ring(struct netlink_ring *ring) -{ - unsigned int head = ring->head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, ring->head); - if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) - break; - if (hdr->nm_status != NL_MMAP_STATUS_SKIP) - break; - netlink_increment_head(ring); - } while (ring->head != head); -} - -static bool netlink_has_valid_frame(struct netlink_ring *ring) -{ - unsigned int head = ring->head, pos = head; - const struct nl_mmap_hdr *hdr; - - do { - hdr = __netlink_lookup_frame(ring, pos); - if (hdr->nm_status == NL_MMAP_STATUS_VALID) - return true; - pos = pos != 0 ? pos - 1 : ring->frame_max; - } while (pos != head); - - return false; -} - -static bool netlink_dump_space(struct netlink_sock *nlk) -{ - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - unsigned int n; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - return false; - - n = ring->head + ring->frame_max / 2; - if (n > ring->frame_max) - n -= ring->frame_max; - - hdr = __netlink_lookup_frame(ring, n); - - return hdr->nm_status == NL_MMAP_STATUS_UNUSED; -} - -static unsigned int netlink_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - unsigned int mask; - int err; - - if (nlk->rx_ring.pg_vec != NULL) { - /* Memory mapped sockets don't call recvmsg(), so flow control - * for dumps is performed here. A dump is allowed to continue - * if at least half the ring is unused. - */ - while (nlk->cb_running && netlink_dump_space(nlk)) { - err = netlink_dump(sk); - if (err < 0) { - sk->sk_err = -err; - sk->sk_error_report(sk); - break; - } - } - netlink_rcv_wake(sk); - } - - mask = datagram_poll(file, sock, wait); - - /* We could already have received frames in the normal receive - * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, - * so if mask contains pollin/etc already, there's no point - * walking the ring. - */ - if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - if (netlink_has_valid_frame(&nlk->rx_ring)) - mask |= POLLIN | POLLRDNORM; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - } - - spin_lock_bh(&sk->sk_write_queue.lock); - if (nlk->tx_ring.pg_vec) { - if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) - mask |= POLLOUT | POLLWRNORM; - } - spin_unlock_bh(&sk->sk_write_queue.lock); - - return mask; -} - -static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) -{ - return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); -} - -static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, - struct netlink_ring *ring, - struct nl_mmap_hdr *hdr) -{ - unsigned int size; - void *data; - - size = ring->frame_size - NL_MMAP_HDRLEN; - data = (void *)hdr + NL_MMAP_HDRLEN; - - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->len = 0; - - skb->destructor = netlink_skb_destructor; - NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; - NETLINK_CB(skb).sk = sk; -} - -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, - u32 dst_portid, u32 dst_group, - struct scm_cookie *scm) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - struct sk_buff *skb; - unsigned int maxlen; - int err = 0, len = 0; - - mutex_lock(&nlk->pg_vec_lock); - - ring = &nlk->tx_ring; - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - - do { - unsigned int nm_len; - - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); - if (hdr == NULL) { - if (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending)) - schedule(); - continue; - } - - nm_len = ACCESS_ONCE(hdr->nm_len); - if (nm_len > maxlen) { - err = -EINVAL; - goto out; - } - - netlink_frame_flush_dcache(hdr, nm_len); - - skb = alloc_skb(nm_len, GFP_KERNEL); - if (skb == NULL) { - err = -ENOBUFS; - goto out; - } - __skb_put(skb, nm_len); - memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - - netlink_increment_head(ring); - - NETLINK_CB(skb).portid = nlk->portid; - NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).creds = scm->creds; - - err = security_netlink_send(sk, skb); - if (err) { - kfree_skb(skb); - goto out; - } - - if (unlikely(dst_group)) { - atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_portid, dst_group, - GFP_KERNEL); - } - err = netlink_unicast(sk, skb, dst_portid, - msg->msg_flags & MSG_DONTWAIT); - if (err < 0) - goto out; - len += err; - - } while (hdr != NULL || - (!(msg->msg_flags & MSG_DONTWAIT) && - atomic_read(&nlk->tx_ring.pending))); - - if (len > 0) - err = len; -out: - mutex_unlock(&nlk->pg_vec_lock); - return err; -} - -static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) -{ - struct nl_mmap_hdr *hdr; - - hdr = netlink_mmap_hdr(skb); - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_frame_flush_dcache(hdr, hdr->nm_len); - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - - NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; - kfree_skb(skb); -} - -static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_ring *ring = &nlk->rx_ring; - struct nl_mmap_hdr *hdr; - - spin_lock_bh(&sk->sk_receive_queue.lock); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) { - spin_unlock_bh(&sk->sk_receive_queue.lock); - kfree_skb(skb); - netlink_overrun(sk); - return; - } - netlink_increment_head(ring); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - - hdr->nm_len = skb->len; - hdr->nm_group = NETLINK_CB(skb).dst_group; - hdr->nm_pid = NETLINK_CB(skb).creds.pid; - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); - netlink_set_status(hdr, NL_MMAP_STATUS_COPY); -} - -#else /* CONFIG_NETLINK_MMAP */ -#define netlink_rx_is_mmaped(sk) false -#define netlink_tx_is_mmaped(sk) false -#define netlink_mmap sock_no_mmap -#define netlink_poll datagram_poll -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0 -#endif /* CONFIG_NETLINK_MMAP */ - static void netlink_skb_destructor(struct sk_buff *skb) { -#ifdef CONFIG_NETLINK_MMAP - struct nl_mmap_hdr *hdr; - struct netlink_ring *ring; - struct sock *sk; - - /* If a packet from the kernel to userspace was freed because of an - * error without being delivered to userspace, the kernel must reset - * the status. In the direction userspace to kernel, the status is - * always reset here after the packet was processed and freed. - */ - if (netlink_skb_is_mmaped(skb)) { - hdr = netlink_mmap_hdr(skb); - sk = NETLINK_CB(skb).sk; - - if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); - ring = &nlk_sk(sk)->tx_ring; - } else { - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { - hdr->nm_len = 0; - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); - } - ring = &nlk_sk(sk)->rx_ring; - } - - WARN_ON(atomic_read(&ring->pending) == 0); - atomic_dec(&ring->pending); - sock_put(sk); - - skb->head = NULL; - } -#endif if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) @@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk) } skb_queue_purge(&sk->sk_receive_queue); -#ifdef CONFIG_NETLINK_MMAP - if (1) { - struct nl_mmap_req req; - - memset(&req, 0, sizeof(req)); - if (nlk->rx_ring.pg_vec) - __netlink_set_ring(sk, &req, false, NULL, 0); - memset(&req, 0, sizeof(req)); - if (nlk->tx_ring.pg_vec) - __netlink_set_ring(sk, &req, true, NULL, 0); - } -#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock, mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); -#ifdef CONFIG_NETLINK_MMAP - mutex_init(&nlk->pg_vec_lock); -#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_S_CONGESTED, &nlk->state)) && - !netlink_skb_is_mmaped(skb)) { + test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) netlink_deliver_tap(skb); -#ifdef CONFIG_NETLINK_MMAP - if (netlink_skb_is_mmaped(skb)) - netlink_queue_mmaped_skb(sk, skb); - else if (netlink_rx_is_mmaped(sk)) - netlink_ring_set_copied(sk, skb); - else -#endif /* CONFIG_NETLINK_MMAP */ - skb_queue_tail(&sk->sk_receive_queue, skb); + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } @@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) int delta; WARN_ON(skb->sk != NULL); - if (netlink_skb_is_mmaped(skb)) - return skb; - delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; @@ -1876,79 +1248,6 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, - unsigned int ldiff, u32 dst_portid, - gfp_t gfp_mask) -{ -#ifdef CONFIG_NETLINK_MMAP - unsigned int maxlen, linear_size; - struct sock *sk = NULL; - struct sk_buff *skb; - struct netlink_ring *ring; - struct nl_mmap_hdr *hdr; - - sk = netlink_getsockbyportid(ssk, dst_portid); - if (IS_ERR(sk)) - goto out; - - ring = &nlk_sk(sk)->rx_ring; - /* fast-path without atomic ops for common case: non-mmaped receiver */ - if (ring->pg_vec == NULL) - goto out_put; - - /* We need to account the full linear size needed as a ring - * slot cannot have non-linear parts. - */ - linear_size = size + ldiff; - if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) - goto out_put; - - skb = alloc_skb_head(gfp_mask); - if (skb == NULL) - goto err1; - - spin_lock_bh(&sk->sk_receive_queue.lock); - /* check again under lock */ - if (ring->pg_vec == NULL) - goto out_free; - - /* check again under lock */ - maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < linear_size) - goto out_free; - - netlink_forward_ring(ring); - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); - if (hdr == NULL) - goto err2; - - netlink_ring_setup_skb(skb, sk, ring, hdr); - netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); - atomic_inc(&ring->pending); - netlink_increment_head(ring); - - spin_unlock_bh(&sk->sk_receive_queue.lock); - return skb; - -err2: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - netlink_overrun(sk); -err1: - sock_put(sk); - return NULL; - -out_free: - kfree_skb(skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); -out_put: - sock_put(sk); -out: -#endif - return alloc_skb(size, gfp_mask); -} -EXPORT_SYMBOL_GPL(__netlink_alloc_skb); - int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -2225,8 +1524,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && - optlen >= sizeof(int) && + if (optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -2279,25 +1577,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } err = 0; break; -#ifdef CONFIG_NETLINK_MMAP - case NETLINK_RX_RING: - case NETLINK_TX_RING: { - struct nl_mmap_req req; - - /* Rings might consume more memory than queue limits, require - * CAP_NET_ADMIN. - */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (optlen < sizeof(req)) - return -EINVAL; - if (copy_from_user(&req, optval, sizeof(req))) - return -EFAULT; - err = netlink_set_ring(sk, &req, - optname == NETLINK_TX_RING); - break; - } -#endif /* CONFIG_NETLINK_MMAP */ case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; @@ -2467,18 +1746,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) smp_rmb(); } - /* It's a really convoluted way for userland to ask for mmaped - * sendmsg(), but that's what we've got... - */ - if (netlink_tx_is_mmaped(sk) && - iter_is_iovec(&msg->msg_iter) && - msg->msg_iter.nr_segs == 1 && - msg->msg_iter.iov->iov_base == NULL) { - err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, - &scm); - goto out; - } - err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; @@ -2794,8 +2061,7 @@ static int netlink_dump(struct sock *sk) goto errout_skb; } - if (!netlink_rx_is_mmaped(sk) && - atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being @@ -2808,15 +2074,12 @@ static int netlink_dump(struct sock *sk) if (alloc_min_size < nlk->max_recvmsg_len) { alloc_size = nlk->max_recvmsg_len; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL | - __GFP_NOWARN | - __GFP_NORETRY); + skb = alloc_skb(alloc_size, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; - skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, - GFP_KERNEL); + skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; @@ -2883,16 +2146,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, struct netlink_sock *nlk; int ret; - /* Memory mapped dump requests need to be copied to avoid looping - * on the pending state in netlink_mmap_sendmsg() while the CB hold - * a reference to the skb. - */ - if (netlink_skb_is_mmaped(skb)) { - skb = skb_copy(skb, GFP_KERNEL); - if (skb == NULL) - return -ENOBUFS; - } else - atomic_inc(&skb->users); + atomic_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { @@ -2965,8 +2219,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); - skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), - NETLINK_CB(in_skb).portid, GFP_KERNEL); + skb = nlmsg_new(payload, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -3240,7 +2493,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = netlink_poll, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -3248,7 +2501,7 @@ static const struct proto_ops netlink_ops = { .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = netlink_mmap, + .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 14437d9b1965..e68ef9ccd703 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -44,12 +44,6 @@ struct netlink_sock { int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); struct module *module; -#ifdef CONFIG_NETLINK_MMAP - struct mutex pg_vec_lock; - struct netlink_ring rx_ring; - struct netlink_ring tx_ring; - atomic_t mapped; -#endif /* CONFIG_NETLINK_MMAP */ struct rhash_head node; struct rcu_head rcu; @@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } -static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ -#ifdef CONFIG_NETLINK_MMAP - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -#else - return false; -#endif /* CONFIG_NETLINK_MMAP */ -} - struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 3ee63a3cff30..8dd836a8dd60 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -8,41 +8,6 @@ #include "af_netlink.h" -#ifdef CONFIG_NETLINK_MMAP -static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, - struct sk_buff *nlskb) -{ - struct netlink_diag_ring ndr; - - ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; - ndr.ndr_block_nr = ring->pg_vec_len; - ndr.ndr_frame_size = ring->frame_size; - ndr.ndr_frame_nr = ring->frame_max + 1; - - return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); -} - -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - int ret; - - mutex_lock(&nlk->pg_vec_lock); - ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); - if (!ret) - ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, - nlskb); - mutex_unlock(&nlk->pg_vec_lock); - - return ret; -} -#else -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) -{ - return 0; -} -#endif - static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); @@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; - if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && - sk_diag_put_rings_cfg(sk, skb)) - goto out_nlmsg_trim; - nlmsg_end(skb, nlh); return 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d3f6b063467b..a09132a69869 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -185,7 +185,7 @@ static int genl_allocate_reserve_groups(int n_groups, int *first_id) } } - if (id + n_groups >= mc_groups_longs * BITS_PER_LONG) { + if (id + n_groups > mc_groups_longs * BITS_PER_LONG) { unsigned long new_longs = mc_groups_longs + BITS_TO_LONGS(n_groups); size_t nlen = new_longs * sizeof(unsigned long); @@ -463,26 +463,6 @@ int genl_unregister_family(struct genl_family *family) EXPORT_SYMBOL(genl_unregister_family); /** - * genlmsg_new_unicast - Allocate generic netlink message for unicast - * @payload: size of the message payload - * @info: information on destination - * @flags: the type of memory to allocate - * - * Allocates a new sk_buff large enough to cover the specified payload - * plus required Netlink headers. Will check receiving socket for - * memory mapped i/o capability and use it if enabled. Will fall back - * to non-mapped skb if message size exceeds the frame size of the ring. - */ -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags) -{ - size_t len = nlmsg_total_size(genlmsg_total_size(payload)); - - return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags); -} -EXPORT_SYMBOL_GPL(genlmsg_new_unicast); - -/** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to @@ -580,6 +560,10 @@ static int genl_family_rcv_msg(struct genl_family *family, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; + if ((ops->flags & GENL_UNS_ADMIN_PERM) && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { int rc; @@ -638,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family, info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; - info.dst_sk = skb->sk; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d143aa9f6654..cd5fd9d728a7 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -10,6 +10,7 @@ config OPENVSWITCH select LIBCRC32C select MPLS select NET_MPLS_GSO + select DST_CACHE ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c88d0f2d3e01..e9dd47b2a85b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, new_mpls_lse = (__be32 *)skb_mpls_header(skb); *new_mpls_lse = mpls->mpls_lse; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, - MPLS_HLEN, 0)); + skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); hdr = eth_hdr(skb); hdr->h_proto = mpls->mpls_ethertype; @@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, mask->eth_dst); - ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); @@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk /* Reconstruct the MAC header. */ skb_push(skb, data->l2_len); memcpy(skb->data, &data->l2_data, data->l2_len); - ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); skb_reset_mac_header(skb); ovs_vport_send(vport, skb); @@ -1160,17 +1158,26 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts, struct sw_flow_key *key) { - int level = this_cpu_read(exec_actions_level); - int err; + static const int ovs_recursion_limit = 5; + int err, level; + + level = __this_cpu_inc_return(exec_actions_level); + if (unlikely(level > ovs_recursion_limit)) { + net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", + ovs_dp_name(dp)); + kfree_skb(skb); + err = -ENETDOWN; + goto out; + } - this_cpu_inc(exec_actions_level); err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); - if (!level) + if (level == 1) process_deferred_actions(dp); - this_cpu_dec(exec_actions_level); +out: + __this_cpu_dec(exec_actions_level); return err; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 91a8b004dc51..e6a7d494df24 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -336,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; - struct ovs_skb_cb ovs_cb; int err; - ovs_cb = *OVS_CB(skb); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); - *OVS_CB(skb) = ovs_cb; if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) @@ -359,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, /* Queue all of the segments. */ skb = segs; do { - *OVS_CB(skb) = ovs_cb; if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; @@ -425,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; - struct genl_info info = { - .dst_sk = ovs_dp_get_net(dp)->genl_sock, - .snd_portid = upcall_info->portid, - }; size_t len; unsigned int hlen; int err, dp_ifindex; @@ -469,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, hlen = skb->len; len = upcall_msg_size(upcall_info, hlen); - user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); + user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -657,7 +650,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { static const struct genl_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = packet_policy, .doit = ovs_packet_cmd_execute } @@ -879,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); - skb = genlmsg_new_unicast(len, info, GFP_KERNEL); + skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -1394,12 +1387,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_del }, @@ -1410,7 +1403,7 @@ static const struct genl_ops dp_flow_genl_ops[] = { .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, .doit = ovs_flow_cmd_set, }, @@ -1484,9 +1477,9 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +static struct sk_buff *ovs_dp_cmd_alloc_info(void) { - return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); + return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. */ @@ -1539,7 +1532,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1660,7 +1653,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1693,7 +1686,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1726,7 +1719,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; - reply = ovs_dp_cmd_alloc_info(info); + reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; @@ -1780,12 +1773,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { static const struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_del }, @@ -1796,7 +1789,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = { .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, .doit = ovs_dp_cmd_set, }, @@ -1915,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net, return ERR_PTR(-EINVAL); } +/* Called with ovs_mutex */ +static void update_headroom(struct datapath *dp) +{ + unsigned dev_headroom, max_headroom = 0; + struct net_device *dev; + struct vport *vport; + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + dev = vport->dev; + dev_headroom = netdev_get_fwd_headroom(dev); + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + } + + dp->max_headroom = max_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + netdev_set_rx_headroom(vport->dev, max_headroom); +} + static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1980,6 +1996,12 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); + + if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) + update_headroom(dp); + else + netdev_set_rx_headroom(vport->dev, dp->max_headroom); + BUG_ON(err < 0); ovs_unlock(); @@ -2046,8 +2068,10 @@ exit_unlock_free: static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { + bool must_update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; + struct datapath *dp; struct vport *vport; int err; @@ -2069,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL); BUG_ON(err < 0); + + /* the vport deletion may trigger dp headroom update */ + dp = vport->dp; + if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) + must_update_headroom = true; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); + + if (must_update_headroom) + update_headroom(dp); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); @@ -2161,12 +2194,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { static const struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_del }, @@ -2177,7 +2210,7 @@ static const struct genl_ops dp_vport_genl_ops[] = { .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, .doit = ovs_vport_cmd_set, }, diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 67bdecd9fdc1..427e39a045cf 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -68,6 +68,8 @@ struct dp_stats_percpu { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @max_headroom: the maximum headroom of all vports in this datapath; it will + * be used by all the internal vports in this dp. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -89,6 +91,8 @@ struct datapath { possible_net_t net; u32 user_features; + + u32 max_headroom; }; /** diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d1bd4a45ca2d..58b8efc23668 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (!tun_dst) return -ENOMEM; + err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); + if (err) { + dst_release((struct dst_entry *)tun_dst); + return err; + } + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, sizeof(*ovs_tun), log); if (IS_ERR(a)) { diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index ec76398a792f..83a5534abd31 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) return stats; } +void internal_set_rx_headroom(struct net_device *dev, int new_hr) +{ + dev->needed_headroom = new_hr; +} + static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, @@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_get_stats, + .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_PHONY_HEADROOM; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } + vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 6a6adf314363..4e3972344aa6 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb) return; skb_push(skb, ETH_HLEN); - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 1605691d9414..5eb7694348b5 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -90,7 +90,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) int err; struct vxlan_config conf = { .no_share = true, - .flags = VXLAN_F_COLLECT_METADATA, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, + /* Don't restrict the packets that can be sent by MTU */ + .mtu = IP_MAX_MTU, }; if (!options) { diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index c10899cb9040..f01f28a567ad 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv) int ovs_vport_receive(struct vport *, struct sk_buff *, const struct ip_tunnel_info *); -static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); -} - static inline const char *ovs_vport_name(struct vport *vport) { return vport->dev->name; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 992396aa635c..d41b1074cb2d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -557,9 +557,8 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, { struct net_device *dev; unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; - struct ethtool_cmd ecmd; + struct ethtool_link_ksettings ecmd; int err; - u32 speed; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); @@ -567,19 +566,19 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po, rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } - err = __ethtool_get_settings(dev, &ecmd); - speed = ethtool_cmd_speed(&ecmd); + err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (!err) { /* * If the link speed is so slow you don't really * need to worry about perf anyways */ - if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { + if (ecmd.base.speed < SPEED_1000 || + ecmd.base.speed == SPEED_UNKNOWN) { return DEFAULT_PRB_RETIRE_TOV; } else { msec = 1; - div = speed / 1000; + div = ecmd.base.speed / 1000; } } @@ -1960,6 +1959,64 @@ static unsigned int run_filter(struct sk_buff *skb, return res; } +static int __packet_rcv_vnet(const struct sk_buff *skb, + struct virtio_net_hdr *vnet_hdr) +{ + *vnet_hdr = (const struct virtio_net_hdr) { 0 }; + + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + /* This is a hint as to how much should be linear. */ + vnet_hdr->hdr_len = + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); + vnet_hdr->gso_size = + __cpu_to_virtio16(vio_le(), sinfo->gso_size); + + if (sinfo->gso_type & SKB_GSO_TCPV4) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (sinfo->gso_type & SKB_GSO_TCPV6) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; + else if (sinfo->gso_type & SKB_GSO_FCOE) + return -EINVAL; + else + BUG(); + + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } else + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(), + skb_checksum_start_offset(skb)); + vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(), + skb->csum_offset); + } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; + } /* else everything is zero */ + + return 0; +} + +static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, + size_t *len) +{ + struct virtio_net_hdr vnet_hdr; + + if (*len < sizeof(vnet_hdr)) + return -EINVAL; + *len -= sizeof(vnet_hdr); + + if (__packet_rcv_vnet(skb, &vnet_hdr)) + return -EINVAL; + + return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); +} + /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. @@ -2148,7 +2205,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + - po->tp_reserve; + po->tp_reserve; + if (po->has_vnet_hdr) + netoff += sizeof(struct virtio_net_hdr); macoff = netoff - maclen; } if (po->tp_version <= TPACKET_V2) { @@ -2185,7 +2244,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) - goto ring_is_full; + goto drop_n_account; if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* @@ -2204,6 +2263,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } spin_unlock(&sk->sk_receive_queue.lock); + if (po->has_vnet_hdr) { + if (__packet_rcv_vnet(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr))) { + spin_lock(&sk->sk_receive_queue.lock); + goto drop_n_account; + } + } + skb_copy_bits(skb, 0, h.raw + macoff, snaplen); if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) @@ -2299,7 +2366,7 @@ drop: kfree_skb(skb); return 0; -ring_is_full: +drop_n_account: po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); @@ -2347,15 +2414,92 @@ static void tpacket_set_protocol(const struct net_device *dev, } } +static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) +{ + unsigned short gso_type = 0; + + if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) + vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); + + if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) + return -EINVAL; + + if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + gso_type = SKB_GSO_TCPV6; + break; + case VIRTIO_NET_HDR_GSO_UDP: + gso_type = SKB_GSO_UDP; + break; + default: + return -EINVAL; + } + + if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) + gso_type |= SKB_GSO_TCP_ECN; + + if (vnet_hdr->gso_size == 0) + return -EINVAL; + } + + vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */ + return 0; +} + +static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, + struct virtio_net_hdr *vnet_hdr) +{ + int n; + + if (*len < sizeof(*vnet_hdr)) + return -EINVAL; + *len -= sizeof(*vnet_hdr); + + n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter); + if (n != sizeof(*vnet_hdr)) + return -EFAULT; + + return __packet_snd_vnet_parse(vnet_hdr, *len); +} + +static int packet_snd_vnet_gso(struct sk_buff *skb, + struct virtio_net_hdr *vnet_hdr) +{ + if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset); + + if (!skb_partial_csum_set(skb, s, o)) + return -EINVAL; + } + + skb_shinfo(skb)->gso_size = + __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size); + skb_shinfo(skb)->gso_type = vnet_hdr->gso_type; + + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + return 0; +} + static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, - void *frame, struct net_device *dev, int size_max, - __be16 proto, unsigned char *addr, int hlen) + void *frame, struct net_device *dev, void *data, int tp_len, + __be16 proto, unsigned char *addr, int hlen, int copylen) { union tpacket_uhdr ph; - int to_write, offset, len, tp_len, nr_frags, len_max; + int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; - void *data; int err; ph.raw = frame; @@ -2367,51 +2511,9 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; - switch (po->tp_version) { - case TPACKET_V2: - tp_len = ph.h2->tp_len; - break; - default: - tp_len = ph.h1->tp_len; - break; - } - if (unlikely(tp_len > size_max)) { - pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); - return -EMSGSIZE; - } - skb_reserve(skb, hlen); skb_reset_network_header(skb); - if (unlikely(po->tp_tx_has_off)) { - int off_min, off_max, off; - off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); - off_max = po->tx_ring.frame_size - tp_len; - if (sock->type == SOCK_DGRAM) { - switch (po->tp_version) { - case TPACKET_V2: - off = ph.h2->tp_net; - break; - default: - off = ph.h1->tp_net; - break; - } - } else { - switch (po->tp_version) { - case TPACKET_V2: - off = ph.h2->tp_mac; - break; - default: - off = ph.h1->tp_mac; - break; - } - } - if (unlikely((off < off_min) || (off_max < off))) - return -EINVAL; - data = ph.raw + off; - } else { - data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); - } to_write = tp_len; if (sock->type == SOCK_DGRAM) { @@ -2419,20 +2521,17 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; - } else if (dev->hard_header_len) { - if (ll_header_truncated(dev, tp_len)) - return -EINVAL; - + } else if (copylen) { skb_push(skb, dev->hard_header_len); - err = skb_store_bits(skb, 0, data, - dev->hard_header_len); + skb_put(skb, copylen - dev->hard_header_len); + err = skb_store_bits(skb, 0, data, copylen); if (unlikely(err)) return err; if (!skb->protocol) tpacket_set_protocol(dev, skb); - data += dev->hard_header_len; - to_write -= dev->hard_header_len; + data += copylen; + to_write -= copylen; } offset = offset_in_page(data); @@ -2469,10 +2568,66 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, return tp_len; } +static int tpacket_parse_header(struct packet_sock *po, void *frame, + int size_max, void **data) +{ + union tpacket_uhdr ph; + int tp_len, off; + + ph.raw = frame; + + switch (po->tp_version) { + case TPACKET_V2: + tp_len = ph.h2->tp_len; + break; + default: + tp_len = ph.h1->tp_len; + break; + } + if (unlikely(tp_len > size_max)) { + pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); + return -EMSGSIZE; + } + + if (unlikely(po->tp_tx_has_off)) { + int off_min, off_max; + + off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); + off_max = po->tx_ring.frame_size - tp_len; + if (po->sk.sk_type == SOCK_DGRAM) { + switch (po->tp_version) { + case TPACKET_V2: + off = ph.h2->tp_net; + break; + default: + off = ph.h1->tp_net; + break; + } + } else { + switch (po->tp_version) { + case TPACKET_V2: + off = ph.h2->tp_mac; + break; + default: + off = ph.h1->tp_mac; + break; + } + } + if (unlikely((off < off_min) || (off_max < off))) + return -EINVAL; + } else { + off = po->tp_hdrlen - sizeof(struct sockaddr_ll); + } + + *data = frame + off; + return tp_len; +} + static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb; struct net_device *dev; + struct virtio_net_hdr *vnet_hdr = NULL; __be16 proto; int err, reserve = 0; void *ph; @@ -2480,9 +2635,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int tp_len, size_max; unsigned char *addr; + void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; - int hlen, tlen; + int hlen, tlen, copylen = 0; mutex_lock(&po->pg_vec_lock); @@ -2515,7 +2671,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if (size_max > dev->mtu + reserve + VLAN_HLEN) + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr) size_max = dev->mtu + reserve + VLAN_HLEN; do { @@ -2527,11 +2683,36 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) continue; } + skb = NULL; + tp_len = tpacket_parse_header(po, ph, size_max, &data); + if (tp_len < 0) + goto tpacket_error; + status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; + if (po->has_vnet_hdr) { + vnet_hdr = data; + data += sizeof(*vnet_hdr); + tp_len -= sizeof(*vnet_hdr); + if (tp_len < 0 || + __packet_snd_vnet_parse(vnet_hdr, tp_len)) { + tp_len = -EINVAL; + goto tpacket_error; + } + copylen = __virtio16_to_cpu(vio_le(), + vnet_hdr->hdr_len); + } + if (dev->hard_header_len) { + if (ll_header_truncated(dev, tp_len)) { + tp_len = -EINVAL; + goto tpacket_error; + } + copylen = max_t(int, copylen, dev->hard_header_len); + } skb = sock_alloc_send_skb(&po->sk, - hlen + tlen + sizeof(struct sockaddr_ll), + hlen + tlen + sizeof(struct sockaddr_ll) + + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { @@ -2540,14 +2721,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) err = len_sum; goto out_status; } - tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, - addr, hlen); + tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, + addr, hlen, copylen); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && + !po->has_vnet_hdr && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { +tpacket_error: if (po->tp_loss) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); @@ -2561,6 +2744,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } + if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) { + tp_len = -EINVAL; + goto tpacket_error; + } + packet_pick_tx_queue(dev, skb); skb->destructor = tpacket_destruct_skb; @@ -2643,12 +2831,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; - int vnet_hdr_len; struct packet_sock *po = pkt_sk(sk); - unsigned short gso_type = 0; int hlen, tlen; int extra_len = 0; - ssize_t n; /* * Get and verify the address. @@ -2686,53 +2871,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (po->has_vnet_hdr) { - vnet_hdr_len = sizeof(vnet_hdr); - - err = -EINVAL; - if (len < vnet_hdr_len) - goto out_unlock; - - len -= vnet_hdr_len; - - err = -EFAULT; - n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter); - if (n != vnet_hdr_len) - goto out_unlock; - - if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + - __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); - - err = -EINVAL; - if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) + err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); + if (err) goto out_unlock; - - if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { - switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { - case VIRTIO_NET_HDR_GSO_TCPV4: - gso_type = SKB_GSO_TCPV4; - break; - case VIRTIO_NET_HDR_GSO_TCPV6: - gso_type = SKB_GSO_TCPV6; - break; - case VIRTIO_NET_HDR_GSO_UDP: - gso_type = SKB_GSO_UDP; - break; - default: - goto out_unlock; - } - - if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) - gso_type |= SKB_GSO_TCP_ECN; - - if (vnet_hdr.gso_size == 0) - goto out_unlock; - - } } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2744,7 +2885,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } err = -EMSGSIZE; - if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) + if (!vnet_hdr.gso_type && + (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; @@ -2775,7 +2917,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (!gso_type && (len > dev->mtu + reserve + extra_len) && + if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; @@ -2789,24 +2931,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { - if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); - if (!skb_partial_csum_set(skb, s, o)) { - err = -EINVAL; - goto out_free; - } - } - - skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); - skb_shinfo(skb)->gso_type = gso_type; - - /* Header must be checked, and gso_segs computed. */ - skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; - skb_shinfo(skb)->gso_segs = 0; - - len += vnet_hdr_len; + err = packet_snd_vnet_gso(skb, &vnet_hdr); + if (err) + goto out_free; + len += sizeof(vnet_hdr); } skb_probe_transport_header(skb, reserve); @@ -3177,51 +3305,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, packet_rcv_has_room(pkt_sk(sk), NULL); if (pkt_sk(sk)->has_vnet_hdr) { - struct virtio_net_hdr vnet_hdr = { 0 }; - - err = -EINVAL; - vnet_hdr_len = sizeof(vnet_hdr); - if (len < vnet_hdr_len) - goto out_free; - - len -= vnet_hdr_len; - - if (skb_is_gso(skb)) { - struct skb_shared_info *sinfo = skb_shinfo(skb); - - /* This is a hint as to how much should be linear. */ - vnet_hdr.hdr_len = - __cpu_to_virtio16(vio_le(), skb_headlen(skb)); - vnet_hdr.gso_size = - __cpu_to_virtio16(vio_le(), sinfo->gso_size); - if (sinfo->gso_type & SKB_GSO_TCPV4) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - else if (sinfo->gso_type & SKB_GSO_TCPV6) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; - else if (sinfo->gso_type & SKB_GSO_FCOE) - goto out_free; - else - BUG(); - if (sinfo->gso_type & SKB_GSO_TCP_ECN) - vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; - } else - vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), - skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), - skb->csum_offset); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; - } /* else everything is zero */ - - err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len); - if (err < 0) + err = packet_rcv_vnet(msg, skb, &len); + if (err) goto out_free; + vnet_hdr_len = sizeof(struct virtio_net_hdr); } /* You lose any data beyond the buffer you gave. If it worries @@ -3552,8 +3639,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv } if (optlen < len) return -EINVAL; - if (pkt_sk(sk)->has_vnet_hdr) - return -EINVAL; if (copy_from_user(&req_u.req, optval, len)) return -EFAULT; return packet_set_ring(sk, &req_u, 0, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index d575ef4e9aa6..ffd5f2297584 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) rcu_read_unlock(); } -void pn_sock_hash(struct sock *sk) +int pn_sock_hash(struct sock *sk) { struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); mutex_lock(&pnsocks.lock); sk_add_node_rcu(sk, hlist); mutex_unlock(&pnsocks.lock); + + return 0; } EXPORT_SYMBOL(pn_sock_hash); @@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) pn->resource = spn->spn_resource; /* Enable RX on the socket */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); out_port: mutex_unlock(&port_mutex); out: diff --git a/net/rds/ib.c b/net/rds/ib.c index f222885ac0c7..9481d55ff6cb 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct ib_device_attr *dev_attr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) - goto free_attr; + return; spin_lock_init(&rds_ibdev->spinlock); atomic_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); - rds_ibdev->max_wrs = dev_attr->max_qp_wr; - rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; + rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? - min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; + rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; - rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? - min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? + min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; - rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; - rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); @@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device) } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", - dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, rds_ibdev->max_8k_fmrs); @@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device) put_dev: rds_ib_dev_put(rds_ibdev); -free_attr: - kfree(dev_attr); } /* diff --git a/net/rds/iw.c b/net/rds/iw.c index 576f1825fc55..f4a9fff829e0 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns); static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; - struct ib_device_attr *dev_attr; /* Only handle iwarp devices */ if (device->node_type != RDMA_NODE_RNIC) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); if (!rds_iwdev) - goto free_attr; + return; spin_lock_init(&rds_iwdev->spinlock); - rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = dev_attr->max_qp_wr; - rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = device->attrs.max_qp_wr; + rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); rds_iwdev->dev = device; rds_iwdev->pd = ib_alloc_pd(device); @@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device) list_add_tail(&rds_iwdev->list, &rds_iw_devices); ib_set_client_data(device, &rds_iw_client, rds_iwdev); - - goto free_attr; + return; err_mr: if (rds_iwdev->mr) @@ -121,8 +110,6 @@ err_pd: ib_dealloc_pd(rds_iwdev->pd); free_dev: kfree(rds_iwdev); -free_attr: - kfree(dev_attr); } static void rds_iw_remove_one(struct ib_device *device, void *client_data) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9d6ddbacd875..ad60299b088b 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -37,7 +37,6 @@ #include <net/tcp.h> #include <net/net_namespace.h> #include <net/netns/generic.h> -#include <net/tcp.h> #include "rds.h" #include "tcp.h" diff --git a/net/rfkill/core.c b/net/rfkill/core.c index a8c05e18da58..03f26e3a6f48 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -1094,17 +1094,6 @@ static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait) return res; } -static bool rfkill_readable(struct rfkill_data *data) -{ - bool r; - - mutex_lock(&data->mtx); - r = !list_empty(&data->events); - mutex_unlock(&data->mtx); - - return r; -} - static ssize_t rfkill_fop_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { @@ -1121,8 +1110,11 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf, goto out; } mutex_unlock(&data->mtx); + /* since we re-check and it just compares pointers, + * using !list_empty() without locking isn't a problem + */ ret = wait_event_interruptible(data->read_wait, - rfkill_readable(data)); + !list_empty(&data->events)); mutex_lock(&data->mtx); if (ret) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 06e7c4a37245..96066665e376 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,10 +36,9 @@ static void free_tcf(struct rcu_head *head) kfree(p); } -static void tcf_hash_destroy(struct tc_action *a) +static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) { struct tcf_common *p = a->priv; - struct tcf_hashinfo *hinfo = a->ops->hinfo; spin_lock_bh(&hinfo->lock); hlist_del(&p->tcfc_head); @@ -68,8 +67,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { if (a->ops->cleanup) a->ops->cleanup(a, bind); - tcf_hash_destroy(a); - ret = 1; + tcf_hash_destroy(a->hinfo, a); + ret = ACT_P_DELETED; } } @@ -77,10 +76,9 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) } EXPORT_SYMBOL(__tcf_hash_release); -static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, - struct tc_action *a) +static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, + struct netlink_callback *cb, struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; @@ -126,9 +124,9 @@ nla_put_failure: goto done; } -static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) +static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, + struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct hlist_head *head; struct hlist_node *n; struct tcf_common *p; @@ -163,18 +161,24 @@ nla_put_failure: return ret; } -static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) { + struct tcf_hashinfo *hinfo = tn->hinfo; + + a->hinfo = hinfo; + if (type == RTM_DELACTION) { - return tcf_del_walker(skb, a); + return tcf_del_walker(hinfo, skb, a); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(skb, cb, a); + return tcf_dump_walker(hinfo, skb, cb, a); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; } } +EXPORT_SYMBOL(tcf_generic_walker); static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { @@ -191,8 +195,9 @@ static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) return p; } -u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) +u32 tcf_hash_new_index(struct tc_action_net *tn) { + struct tcf_hashinfo *hinfo = tn->hinfo; u32 val = hinfo->index; do { @@ -205,28 +210,31 @@ u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo) } EXPORT_SYMBOL(tcf_hash_new_index); -int tcf_hash_search(struct tc_action *a, u32 index) +int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = tcf_hash_lookup(index, hinfo); if (p) { a->priv = p; + a->hinfo = hinfo; return 1; } return 0; } EXPORT_SYMBOL(tcf_hash_search); -int tcf_hash_check(u32 index, struct tc_action *a, int bind) +int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = NULL; if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) p->tcfc_bindcnt++; p->tcfc_refcnt++; a->priv = p; + a->hinfo = hinfo; return 1; } return 0; @@ -243,11 +251,11 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) } EXPORT_SYMBOL(tcf_hash_cleanup); -int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind, bool cpustats) +int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + struct tc_action *a, int size, int bind, bool cpustats) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); + struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; if (unlikely(!p)) @@ -272,7 +280,7 @@ err2: } spin_lock_init(&p->tcfc_lock); INIT_HLIST_NODE(&p->tcfc_head); - p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); + p->tcfc_index = index ? index : tcf_hash_new_index(tn); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { @@ -286,14 +294,15 @@ err2: } a->priv = (void *) p; + a->hinfo = hinfo; return 0; } EXPORT_SYMBOL(tcf_hash_create); -void tcf_hash_insert(struct tc_action *a) +void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_common *p = a->priv; - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_hashinfo *hinfo = tn->hinfo; unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); spin_lock_bh(&hinfo->lock); @@ -302,59 +311,78 @@ void tcf_hash_insert(struct tc_action *a) } EXPORT_SYMBOL(tcf_hash_insert); +void tcf_hashinfo_destroy(const struct tc_action_ops *ops, + struct tcf_hashinfo *hinfo) +{ + struct tc_action a = { + .ops = ops, + .hinfo = hinfo, + }; + int i; + + for (i = 0; i < hinfo->hmask + 1; i++) { + struct tcf_common *p; + struct hlist_node *n; + + hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { + int ret; + + a.priv = p; + ret = __tcf_hash_release(&a, false, true); + if (ret == ACT_P_DELETED) + module_put(ops->owner); + else if (ret < 0) + return; + } + } + kfree(hinfo->htab); +} +EXPORT_SYMBOL(tcf_hashinfo_destroy); + static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act, unsigned int mask) +int tcf_register_action(struct tc_action_ops *act, + struct pernet_operations *ops) { struct tc_action_ops *a; - int err; + int ret; - /* Must supply act, dump and init */ - if (!act->act || !act->dump || !act->init) + if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) return -EINVAL; - /* Supply defaults */ - if (!act->lookup) - act->lookup = tcf_hash_search; - if (!act->walk) - act->walk = tcf_generic_walker; - - act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); - if (!act->hinfo) - return -ENOMEM; - err = tcf_hashinfo_init(act->hinfo, mask); - if (err) { - kfree(act->hinfo); - return err; - } - write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); - tcf_hashinfo_destroy(act->hinfo); - kfree(act->hinfo); return -EEXIST; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); + + ret = register_pernet_subsys(ops); + if (ret) { + tcf_unregister_action(act, ops); + return ret; + } + return 0; } EXPORT_SYMBOL(tcf_register_action); -int tcf_unregister_action(struct tc_action_ops *act) +int tcf_unregister_action(struct tc_action_ops *act, + struct pernet_operations *ops) { struct tc_action_ops *a; int err = -ENOENT; + unregister_pernet_subsys(ops); + write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { list_del(&act->head); - tcf_hashinfo_destroy(act->hinfo); - kfree(act->hinfo); err = 0; break; } @@ -721,8 +749,8 @@ static struct tc_action *create_a(int i) return act; } -static struct tc_action * -tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) +static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; @@ -749,7 +777,7 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid) if (a->ops == NULL) /* could happen in batch of actions */ goto err_free; err = -ENOENT; - if (a->ops->lookup(a, index) == 0) + if (a->ops->lookup(net, a, index) == 0) goto err_mod; module_put(a->ops->owner); @@ -819,7 +847,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (nest == NULL) goto out_module_put; - err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a); + err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a); if (err < 0) goto out_module_put; if (err == 0) @@ -897,7 +925,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, portid); + act = tcf_action_get_1(net, tb[i], n, portid); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; @@ -1044,6 +1072,7 @@ find_dump_kind(const struct nlmsghdr *n) static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -1078,7 +1107,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(skb, cb, RTM_GETACTION, &a); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 0bc6f912f870..8c9f1f0459ab 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -33,6 +33,8 @@ struct tcf_bpf_cfg { bool is_ebpf; }; +static int bpf_net_id; + static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { @@ -275,6 +277,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *act, int replace, int bind) { + struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_bpf_cfg cfg, old; struct tc_act_bpf *parm; @@ -294,8 +297,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_hash_check(parm->index, act, bind)) { - ret = tcf_hash_create(parm->index, est, act, + if (!tcf_hash_check(tn, parm->index, act, bind)) { + ret = tcf_hash_create(tn, parm->index, est, act, sizeof(*prog), bind, true); if (ret < 0) return ret; @@ -344,7 +347,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(prog->filter, cfg.filter); if (res == ACT_P_CREATED) { - tcf_hash_insert(act); + tcf_hash_insert(tn, act); } else { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); @@ -367,6 +370,22 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind) tcf_bpf_cfg_cleanup(&tmp); } +static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -375,16 +394,39 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, + .walk = tcf_bpf_walker, + .lookup = tcf_bpf_search, +}; + +static __net_init int bpf_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tc_action_net_init(tn, &act_bpf_ops, BPF_TAB_MASK); +} + +static void __net_exit bpf_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations bpf_net_ops = { + .init = bpf_init_net, + .exit = bpf_exit_net, + .id = &bpf_net_id, + .size = sizeof(struct tc_action_net), }; static int __init bpf_init_module(void) { - return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK); + return tcf_register_action(&act_bpf_ops, &bpf_net_ops); } static void __exit bpf_cleanup_module(void) { - tcf_unregister_action(&act_bpf_ops); + tcf_unregister_action(&act_bpf_ops, &bpf_net_ops); } module_init(bpf_init_module); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index bb41699c6c49..c0ed93ce2391 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -30,6 +30,8 @@ #define CONNMARK_TAB_MASK 3 +static int connmark_net_id; + static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -97,6 +99,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; struct tcf_connmark_info *ci; struct tc_connmark *parm; @@ -111,9 +114,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*ci), bind, false); if (ret) return ret; @@ -122,7 +125,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci->net = net; ci->zone = parm->zone; - tcf_hash_insert(a); + tcf_hash_insert(tn, a); ret = ACT_P_CREATED; } else { ci = to_connmark(a); @@ -169,6 +172,22 @@ nla_put_failure: return -1; } +static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -176,16 +195,39 @@ static struct tc_action_ops act_connmark_ops = { .act = tcf_connmark, .dump = tcf_connmark_dump, .init = tcf_connmark_init, + .walk = tcf_connmark_walker, + .lookup = tcf_connmark_search, +}; + +static __net_init int connmark_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tc_action_net_init(tn, &act_connmark_ops, CONNMARK_TAB_MASK); +} + +static void __net_exit connmark_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations connmark_net_ops = { + .init = connmark_init_net, + .exit = connmark_exit_net, + .id = &connmark_net_id, + .size = sizeof(struct tc_action_net), }; static int __init connmark_init_module(void) { - return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK); + return tcf_register_action(&act_connmark_ops, &connmark_net_ops); } static void __exit connmark_cleanup_module(void) { - tcf_unregister_action(&act_connmark_ops); + tcf_unregister_action(&act_connmark_ops, &connmark_net_ops); } module_init(connmark_init_module); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b07c535ba8e7..d22426cdebc0 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -42,9 +42,13 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int csum_net_id; + +static int tcf_csum_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { + struct tc_action_net *tn = net_generic(net, csum_net_id); struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -61,9 +65,9 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -82,7 +86,7 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -105,9 +109,7 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, int hl = ihl + jhl; if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || - (skb_cloned(skb) && - !skb_clone_writable(skb, hl + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, hl + ntkoff)) return NULL; else return (void *)(skb_network_header(skb) + ihl); @@ -365,9 +367,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { - if (skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff)) goto fail; ip_send_check(ip_hdr(skb)); @@ -559,6 +559,22 @@ nla_put_failure: return -1; } +static int tcf_csum_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -566,6 +582,29 @@ static struct tc_action_ops act_csum_ops = { .act = tcf_csum, .dump = tcf_csum_dump, .init = tcf_csum_init, + .walk = tcf_csum_walker, + .lookup = tcf_csum_search, +}; + +static __net_init int csum_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tc_action_net_init(tn, &act_csum_ops, CSUM_TAB_MASK); +} + +static void __net_exit csum_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations csum_net_ops = { + .init = csum_init_net, + .exit = csum_exit_net, + .id = &csum_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Checksum updating actions"); @@ -573,12 +612,12 @@ MODULE_LICENSE("GPL"); static int __init csum_init_module(void) { - return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK); + return tcf_register_action(&act_csum_ops, &csum_net_ops); } static void __exit csum_cleanup_module(void) { - tcf_unregister_action(&act_csum_ops); + tcf_unregister_action(&act_csum_ops, &csum_net_ops); } module_init(csum_init_module); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 5c1b05170736..887fc1f209ff 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -25,6 +25,8 @@ #define GACT_TAB_MASK 15 +static int gact_net_id; + #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { @@ -57,6 +59,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; struct tcf_gact *gact; @@ -88,9 +91,9 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), - bind, true); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*gact), bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -118,7 +121,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -183,6 +186,22 @@ nla_put_failure: return -1; } +static int tcf_gact_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -190,6 +209,29 @@ static struct tc_action_ops act_gact_ops = { .act = tcf_gact, .dump = tcf_gact_dump, .init = tcf_gact_init, + .walk = tcf_gact_walker, + .lookup = tcf_gact_search, +}; + +static __net_init int gact_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tc_action_net_init(tn, &act_gact_ops, GACT_TAB_MASK); +} + +static void __net_exit gact_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations gact_net_ops = { + .init = gact_init_net, + .exit = gact_exit_net, + .id = &gact_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -203,12 +245,13 @@ static int __init gact_init_module(void) #else pr_info("GACT probability NOT on\n"); #endif - return tcf_register_action(&act_gact_ops, GACT_TAB_MASK); + + return tcf_register_action(&act_gact_ops, &gact_net_ops); } static void __exit gact_cleanup_module(void) { - tcf_unregister_action(&act_gact_ops); + tcf_unregister_action(&act_gact_ops, &gact_net_ops); } module_init(gact_init_module); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d05869646515..89c41a1f3589 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -30,6 +30,10 @@ #define IPT_TAB_MASK 15 +static int ipt_net_id; + +static int xt_net_id; + static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { struct xt_tgchk_param par; @@ -83,8 +87,9 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; @@ -113,8 +118,9 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - if (!tcf_hash_check(index, a, bind) ) { - ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false); + if (!tcf_hash_check(tn, index, a, bind)) { + ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind, + false); if (ret) return ret; ret = ACT_P_CREATED; @@ -157,7 +163,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; err3: @@ -170,6 +176,24 @@ err1: return err; } +static int tcf_ipt_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return __tcf_ipt_init(tn, nla, est, a, ovr, bind); +} + +static int tcf_xt_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return __tcf_ipt_init(tn, nla, est, a, ovr, bind); +} + static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -260,6 +284,22 @@ nla_put_failure: return -1; } +static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -268,8 +308,47 @@ static struct tc_action_ops act_ipt_ops = { .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, + .walk = tcf_ipt_walker, + .lookup = tcf_ipt_search, +}; + +static __net_init int ipt_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tc_action_net_init(tn, &act_ipt_ops, IPT_TAB_MASK); +} + +static void __net_exit ipt_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations ipt_net_ops = { + .init = ipt_init_net, + .exit = ipt_exit_net, + .id = &ipt_net_id, + .size = sizeof(struct tc_action_net), }; +static int tcf_xt_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -277,7 +356,30 @@ static struct tc_action_ops act_xt_ops = { .act = tcf_ipt, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, - .init = tcf_ipt_init, + .init = tcf_xt_init, + .walk = tcf_xt_walker, + .lookup = tcf_xt_search, +}; + +static __net_init int xt_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tc_action_net_init(tn, &act_xt_ops, IPT_TAB_MASK); +} + +static void __net_exit xt_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations xt_net_ops = { + .init = xt_init_net, + .exit = xt_exit_net, + .id = &xt_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); @@ -289,12 +391,13 @@ static int __init ipt_init_module(void) { int ret1, ret2; - ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); + ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops); if (ret1 < 0) - printk("Failed to load xt action\n"); - ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); + pr_err("Failed to load xt action\n"); + + ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops); if (ret2 < 0) - printk("Failed to load ipt action\n"); + pr_err("Failed to load ipt action\n"); if (ret1 < 0 && ret2 < 0) { return ret1; @@ -304,8 +407,8 @@ static int __init ipt_init_module(void) static void __exit ipt_cleanup_module(void) { - tcf_unregister_action(&act_xt_ops); - tcf_unregister_action(&act_ipt_ops); + tcf_unregister_action(&act_ipt_ops, &ipt_net_ops); + tcf_unregister_action(&act_xt_ops, &xt_net_ops); } module_init(ipt_init_module); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 32fcdecdb9e2..6b284d991e0b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -50,10 +50,13 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; +static int mirred_net_id; + static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; struct tcf_mirred *m; @@ -96,11 +99,11 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, dev = NULL; } - if (!tcf_hash_check(parm->index, a, bind)) { + if (!tcf_hash_check(tn, parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*m), - bind, true); + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*m), bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -130,7 +133,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock_bh(&mirred_list_lock); - tcf_hash_insert(a); + tcf_hash_insert(tn, a); } return ret; @@ -221,6 +224,22 @@ nla_put_failure: return -1; } +static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_hash_search(tn, a, index); +} + static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -257,6 +276,29 @@ static struct tc_action_ops act_mirred_ops = { .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, + .walk = tcf_mirred_walker, + .lookup = tcf_mirred_search, +}; + +static __net_init int mirred_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tc_action_net_init(tn, &act_mirred_ops, MIRRED_TAB_MASK); +} + +static void __net_exit mirred_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations mirred_net_ops = { + .init = mirred_init_net, + .exit = mirred_exit_net, + .id = &mirred_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); @@ -270,12 +312,12 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK); + return tcf_register_action(&act_mirred_ops, &mirred_net_ops); } static void __exit mirred_cleanup_module(void) { - tcf_unregister_action(&act_mirred_ops); + tcf_unregister_action(&act_mirred_ops, &mirred_net_ops); unregister_netdevice_notifier(&mirred_device_notifier); } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b7c4ead8b5a8..0f65cdfbfb1d 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -31,6 +31,8 @@ #define NAT_TAB_MASK 15 +static int nat_net_id; + static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; @@ -38,6 +40,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; struct tc_nat *parm; int ret = 0, err; @@ -54,9 +57,9 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; ret = ACT_P_CREATED; @@ -79,7 +82,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -126,9 +129,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, addr = iph->daddr; if (!((old_addr ^ addr) & mask)) { - if (skb_cloned(skb) && - !skb_clone_writable(skb, sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, sizeof(*iph) + noff)) goto drop; new_addr &= mask; @@ -156,9 +157,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcphdr *tcph; if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff)) goto drop; tcph = (void *)(skb_network_header(skb) + ihl); @@ -171,9 +170,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct udphdr *udph; if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || - (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + skb_try_make_writable(skb, ihl + sizeof(*udph) + noff)) goto drop; udph = (void *)(skb_network_header(skb) + ihl); @@ -213,10 +210,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, if ((old_addr ^ addr) & mask) break; - if (skb_cloned(skb) && - !skb_clone_writable(skb, ihl + sizeof(*icmph) + - sizeof(*iph) + noff) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_try_make_writable(skb, ihl + sizeof(*icmph) + + sizeof(*iph) + noff)) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); @@ -282,6 +277,22 @@ nla_put_failure: return -1; } +static int tcf_nat_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -289,6 +300,29 @@ static struct tc_action_ops act_nat_ops = { .act = tcf_nat, .dump = tcf_nat_dump, .init = tcf_nat_init, + .walk = tcf_nat_walker, + .lookup = tcf_nat_search, +}; + +static __net_init int nat_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tc_action_net_init(tn, &act_nat_ops, NAT_TAB_MASK); +} + +static void __net_exit nat_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations nat_net_ops = { + .init = nat_init_net, + .exit = nat_exit_net, + .id = &nat_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Stateless NAT actions"); @@ -296,12 +330,12 @@ MODULE_LICENSE("GPL"); static int __init nat_init_module(void) { - return tcf_register_action(&act_nat_ops, NAT_TAB_MASK); + return tcf_register_action(&act_nat_ops, &nat_net_ops); } static void __exit nat_cleanup_module(void) { - tcf_unregister_action(&act_nat_ops); + tcf_unregister_action(&act_nat_ops, &nat_net_ops); } module_init(nat_init_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e38a7701f154..429c3ab65142 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -25,6 +25,8 @@ #define PEDIT_TAB_MASK 15 +static int pedit_net_id; + static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; @@ -33,6 +35,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; int ret = 0, err; @@ -54,11 +57,11 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; - if (!tcf_hash_check(parm->index, a, bind)) { + if (!tcf_hash_check(tn, parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), - bind, false); + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*p), bind, false); if (ret) return ret; p = to_pedit(a); @@ -93,7 +96,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -211,6 +214,22 @@ nla_put_failure: return -1; } +static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -219,6 +238,29 @@ static struct tc_action_ops act_pedit_ops = { .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, + .walk = tcf_pedit_walker, + .lookup = tcf_pedit_search, +}; + +static __net_init int pedit_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tc_action_net_init(tn, &act_pedit_ops, PEDIT_TAB_MASK); +} + +static void __net_exit pedit_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations pedit_net_ops = { + .init = pedit_init_net, + .exit = pedit_exit_net, + .id = &pedit_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -227,12 +269,12 @@ MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { - return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK); + return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { - tcf_unregister_action(&act_pedit_ops); + tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 9a1c42a43f92..330f14e302e8 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -55,10 +55,14 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ -static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a) +static int police_net_id; + +static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) { - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_hashinfo *hinfo = tn->hinfo; struct hlist_head *head; struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; @@ -121,7 +125,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; - struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tc_action_net *tn = net_generic(net, police_net_id); + struct tcf_hashinfo *hinfo = tn->hinfo; int size; if (nla == NULL) @@ -139,7 +144,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { - if (tcf_hash_search(a, parm->index)) { + if (tcf_hash_search(tn, a, parm->index)) { police = to_police(a->priv); if (bind) { police->tcf_bindcnt += 1; @@ -233,7 +238,7 @@ override: police->tcfp_t_c = ktime_get_ns(); police->tcf_index = parm->index ? parm->index : - tcf_hash_new_index(hinfo); + tcf_hash_new_index(tn); h = tcf_hash(police->tcf_index, POL_TAB_MASK); spin_lock_bh(&hinfo->lock); hlist_add_head(&police->tcf_head, &hinfo->htab[h]); @@ -342,6 +347,13 @@ nla_put_failure: return -1; } +static int tcf_police_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_hash_search(tn, a, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -353,19 +365,41 @@ static struct tc_action_ops act_police_ops = { .act = tcf_act_police, .dump = tcf_act_police_dump, .init = tcf_act_police_locate, - .walk = tcf_act_police_walker + .walk = tcf_act_police_walker, + .lookup = tcf_police_search, +}; + +static __net_init int police_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tc_action_net_init(tn, &act_police_ops, POL_TAB_MASK); +} + +static void __net_exit police_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations police_net_ops = { + .init = police_init_net, + .exit = police_exit_net, + .id = &police_net_id, + .size = sizeof(struct tc_action_net), }; static int __init police_init_module(void) { - return tcf_register_action(&act_police_ops, POL_TAB_MASK); + return tcf_register_action(&act_police_ops, &police_net_ops); } static void __exit police_cleanup_module(void) { - tcf_unregister_action(&act_police_ops); + tcf_unregister_action(&act_police_ops, &police_net_ops); } module_init(police_init_module); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index d6b708d6afdf..75b2be13fbcc 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -26,6 +26,8 @@ #define SIMP_TAB_MASK 7 +static int simp_net_id; + #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -80,6 +82,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; @@ -102,9 +105,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_DEF_PARMS]); defdata = nla_data(tb[TCA_DEF_DATA]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*d), bind, false); if (ret) return ret; @@ -129,7 +132,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -161,6 +164,22 @@ nla_put_failure: return -1; } +static int tcf_simp_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -169,6 +188,29 @@ static struct tc_action_ops act_simp_ops = { .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, + .walk = tcf_simp_walker, + .lookup = tcf_simp_search, +}; + +static __net_init int simp_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tc_action_net_init(tn, &act_simp_ops, SIMP_TAB_MASK); +} + +static void __net_exit simp_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations simp_net_ops = { + .init = simp_init_net, + .exit = simp_exit_net, + .id = &simp_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); @@ -177,8 +219,7 @@ MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { - int ret; - ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK); + int ret = tcf_register_action(&act_simp_ops, &simp_net_ops); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; @@ -186,7 +227,7 @@ static int __init simp_init_module(void) static void __exit simp_cleanup_module(void) { - tcf_unregister_action(&act_simp_ops); + tcf_unregister_action(&act_simp_ops, &simp_net_ops); } module_init(simp_init_module); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6751b5f8c046..cfcdbdc00c9b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -29,6 +29,8 @@ #define SKBEDIT_TAB_MASK 15 +static int skbedit_net_id; + static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -61,6 +63,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -98,9 +101,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*d), bind, false); if (ret) return ret; @@ -130,7 +133,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -173,6 +176,22 @@ nla_put_failure: return -1; } +static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -180,6 +199,29 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .walk = tcf_skbedit_walker, + .lookup = tcf_skbedit_search, +}; + +static __net_init int skbedit_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tc_action_net_init(tn, &act_skbedit_ops, SKBEDIT_TAB_MASK); +} + +static void __net_exit skbedit_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations skbedit_net_ops = { + .init = skbedit_init_net, + .exit = skbedit_exit_net, + .id = &skbedit_net_id, + .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); @@ -188,12 +230,12 @@ MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { - return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK); + return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops); } static void __exit skbedit_cleanup_module(void) { - tcf_unregister_action(&act_skbedit_ops); + tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops); } module_init(skbedit_init_module); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 796785e0bf96..bab8ae0cefc0 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -21,6 +21,8 @@ #define VLAN_TAB_MASK 15 +static int vlan_net_id; + static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -68,6 +70,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tc_vlan *parm; struct tcf_vlan *v; @@ -115,9 +118,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } action = parm->v_action; - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*v), - bind, false); + if (!tcf_hash_check(tn, parm->index, a, bind)) { + ret = tcf_hash_create(tn, parm->index, est, a, + sizeof(*v), bind, false); if (ret) return ret; @@ -143,7 +146,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&v->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(tn, a); return ret; } @@ -181,6 +184,22 @@ nla_put_failure: return -1; } +static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_generic_walker(tn, skb, cb, type, a); +} + +static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_hash_search(tn, a, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -188,16 +207,39 @@ static struct tc_action_ops act_vlan_ops = { .act = tcf_vlan, .dump = tcf_vlan_dump, .init = tcf_vlan_init, + .walk = tcf_vlan_walker, + .lookup = tcf_vlan_search, +}; + +static __net_init int vlan_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tc_action_net_init(tn, &act_vlan_ops, VLAN_TAB_MASK); +} + +static void __net_exit vlan_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations vlan_net_ops = { + .init = vlan_init_net, + .exit = vlan_exit_net, + .id = &vlan_net_id, + .size = sizeof(struct tc_action_net), }; static int __init vlan_init_module(void) { - return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK); + return tcf_register_action(&act_vlan_ops, &vlan_net_ops); } static void __exit vlan_cleanup_module(void) { - tcf_unregister_action(&act_vlan_ops); + tcf_unregister_action(&act_vlan_ops, &vlan_net_ops); } module_init(vlan_init_module); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4fbb67430ce4..563cdad76448 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -43,6 +43,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <linux/netdevice.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -58,6 +59,7 @@ struct tc_u_knode { #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt __percpu *pf; #endif + u32 flags; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; @@ -424,6 +426,97 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } +static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, 0)) { + offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; + offload.cls_u32->knode.handle = handle; + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_replace_hw_hnode(struct tcf_proto *tp, + struct tc_u_hnode *h, + u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, flags)) { + offload.cls_u32->command = TC_CLSU32_NEW_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, 0)) { + offload.cls_u32->command = TC_CLSU32_DELETE_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + +static void u32_replace_hw_knode(struct tcf_proto *tp, + struct tc_u_knode *n, + u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_u32_offload u32_offload = {0}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSU32; + offload.cls_u32 = &u32_offload; + + if (tc_should_offload(dev, flags)) { + offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; + offload.cls_u32->knode.handle = n->handle; + offload.cls_u32->knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + offload.cls_u32->knode.val = n->val; + offload.cls_u32->knode.mask = n->mask; +#else + offload.cls_u32->knode.val = 0; + offload.cls_u32->knode.mask = 0; +#endif + offload.cls_u32->knode.sel = &n->sel; + offload.cls_u32->knode.exts = &n->exts; + if (n->ht_down) + offload.cls_u32->knode.link_handle = n->ht_down->handle; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + } +} + static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; @@ -434,6 +527,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); + u32_remove_hw_knode(tp, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -454,6 +548,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) phn; hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { + u32_clear_hw_hnode(tp, ht); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -540,8 +635,10 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) if (ht == NULL) return 0; - if (TC_U32_KEY(ht->handle)) + if (TC_U32_KEY(ht->handle)) { + u32_remove_hw_knode(tp, ht->handle); return u32_delete_key(tp, (struct tc_u_knode *)ht); + } if (root_ht == ht) return -EINVAL; @@ -587,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, + [TCA_U32_FLAGS] = { .type = NLA_U32 }, }; static int u32_set_parms(struct net *net, struct tcf_proto *tp, @@ -694,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, #endif new->fshift = n->fshift; new->res = n->res; + new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, n->ht_down); /* bump reference count as long as we hold pointer to structure */ @@ -733,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; - u32 htid; + u32 htid, flags = 0; int err; #ifdef CONFIG_CLS_U32_PERF size_t size; @@ -746,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; + if (tb[TCA_U32_FLAGS]) + flags = nla_get_u32(tb[TCA_U32_FLAGS]); + n = (struct tc_u_knode *)*arg; if (n) { struct tc_u_knode *new; @@ -753,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_KEY(n->handle) == 0) return -EINVAL; + if (n->flags != flags) + return -EINVAL; + new = u32_init_knode(tp, n); if (!new) return -ENOMEM; @@ -769,6 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_rcu); + u32_replace_hw_knode(tp, new, flags); return 0; } @@ -795,6 +901,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); *arg = (unsigned long)ht; + + u32_replace_hw_hnode(tp, ht, flags); return 0; } @@ -845,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; + n->flags = flags; tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); n->tp = tp; @@ -877,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); - + u32_replace_hw_knode(tp, n, flags); *arg = (unsigned long)n; return 0; } @@ -982,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, nla_put_u32(skb, TCA_U32_LINK, ht_down->handle)) goto nla_put_failure; + if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags)) + goto nla_put_failure; + #ifdef CONFIG_CLS_U32_MARK if ((n->val || n->mask)) { struct tc_u32_mark mark = {.val = n->val, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b5c2cf2aa6d4..3b180ff72f79 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -744,14 +744,15 @@ static u32 qdisc_alloc_handle(struct net_device *dev) return 0; } -void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) +void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, + unsigned int len) { const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; int drops; - if (n == 0) + if (n == 0 && len == 0) return; drops = max_t(int, n, 0); rcu_read_lock(); @@ -774,11 +775,12 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; + sch->qstats.backlog -= len; __qdisc_qstats_drop(sch, drops); } rcu_read_unlock(); } -EXPORT_SYMBOL(qdisc_tree_decrease_qlen); +EXPORT_SYMBOL(qdisc_tree_reduce_backlog); static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, @@ -1841,7 +1843,7 @@ reclassify: return err; } - return -1; + return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= MAX_REC_LOOP)) { @@ -1852,6 +1854,7 @@ reset: } tp = old_tp; + protocol = tc_skb_protocol(skb); goto reclassify; #endif } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c538d9e4a8f6..baafddf229ce 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1624,13 +1624,8 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new->reshape_fail = cbq_reshape_fail; #endif } - sch_tree_lock(sch); - *old = cl->q; - cl->q = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->q); return 0; } @@ -1914,7 +1909,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; - unsigned int qlen; + unsigned int qlen, backlog; if (cl->filters || cl->children || cl == &q->link) return -EBUSY; @@ -1922,8 +1917,9 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); qlen = cl->q->q.qlen; + backlog = cl->q->qstats.backlog; qdisc_reset(cl->q); - qdisc_tree_decrease_qlen(cl->q, qlen); + qdisc_tree_reduce_backlog(cl->q, qlen, backlog); if (cl->next_alive) cbq_deactivate_class(cl); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 5ffb8b8337c7..0a08c860eee4 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -128,8 +128,8 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) choke_zap_tail_holes(q); qdisc_qstats_backlog_dec(sch, skb); + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_drop(skb, sch); - qdisc_tree_decrease_qlen(sch, 1); --sch->q.qlen; } @@ -456,6 +456,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) old = q->tab; if (old) { unsigned int oqlen = sch->q.qlen, tail = 0; + unsigned dropped = 0; while (q->head != q->tail) { struct sk_buff *skb = q->tab[q->head]; @@ -467,11 +468,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ntab[tail++] = skb; continue; } + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped); q->head = 0; q->tail = tail; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 535007d5f0b5..9b7e2980ee5c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -79,12 +79,13 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); - /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, * or HTB crashes. Defer it for next round. */ if (q->stats.drop_count && sch->q.qlen) { - qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; + q->stats.drop_len = 0; } if (skb) qdisc_bstats_update(sch, skb); @@ -116,7 +117,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) { struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen; + unsigned int qlen, dropped = 0; int err; if (!opt) @@ -156,10 +157,11 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index f26bdea875c1..a63e879e8975 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -53,9 +53,10 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) static void drr_purge_queue(struct drr_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { @@ -226,11 +227,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - drr_purge_queue(cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } @@ -403,6 +400,8 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) if (len <= cl->deficit) { cl->deficit -= len; skb = qdisc_dequeue_peeked(cl->qdisc); + if (unlikely(skb == NULL)) + goto out; if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index f357f34d02d2..d0dff0cd8186 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -73,13 +73,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - *old = p->q; - p->q = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &p->q); return 0; } @@ -264,6 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -286,6 +281,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) return NULL; qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; index = skb->tc_index & (p->indices - 1); @@ -401,6 +397,7 @@ static void dsmark_reset(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); qdisc_reset(p->q); + sch->qstats.backlog = 0; sch->q.qlen = 0; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 109b2322778f..3c6a47d66a04 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -662,6 +662,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; int err, drop_count = 0; + unsigned drop_len = 0; u32 fq_log; if (!opt) @@ -736,10 +737,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (!skb) break; + drop_len += qdisc_pkt_len(skb); kfree_skb(skb); drop_count++; } - qdisc_tree_decrease_qlen(sch, drop_count); + qdisc_tree_reduce_backlog(sch, drop_count, drop_len); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4c834e93dafb..d3fc8f9dd3d4 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -175,7 +175,7 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); - unsigned int idx; + unsigned int idx, prev_backlog; struct fq_codel_flow *flow; int uninitialized_var(ret); @@ -203,6 +203,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; + prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet * from this flow. @@ -211,7 +212,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } @@ -241,6 +242,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) struct fq_codel_flow *flow; struct list_head *head; u32 prev_drop_count, prev_ecn_mark; + unsigned int prev_backlog; begin: head = &q->new_flows; @@ -259,6 +261,7 @@ begin: prev_drop_count = q->cstats.drop_count; prev_ecn_mark = q->cstats.ecn_mark; + prev_backlog = sch->qstats.backlog; skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, dequeue); @@ -276,12 +279,14 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, * or HTB crashes. Defer it for next round. */ if (q->cstats.drop_count && sch->q.qlen) { - qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, + q->cstats.drop_len); q->cstats.drop_count = 0; + q->cstats.drop_len = 0; } return skb; } @@ -372,11 +377,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = fq_codel_dequeue(sch); + q->cstats.drop_len += qdisc_pkt_len(skb); kfree_skb(skb); q->cstats.drop_count++; } - qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; + q->cstats.drop_len = 0; sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b7ebe2c87586..d783d7cc3348 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -895,9 +895,10 @@ static void hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static void @@ -1215,11 +1216,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; } - sch_tree_lock(sch); - hfsc_purge_queue(sch, cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 86b04e31e60b..13d6f83ec491 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -382,6 +382,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; + unsigned int prev_backlog; idx = hhf_classify(skb, sch); @@ -409,6 +410,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; + prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. @@ -417,7 +419,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } @@ -527,7 +529,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen; + unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -577,12 +579,14 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) } qlen = sch->q.qlen; + prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); kfree_skb(skb); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, + prev_backlog - sch->qstats.backlog); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 15ccd7f8fb2a..87b02ed3d5f2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -600,6 +600,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) htb_activate(q, cl); } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -889,6 +890,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) ok: qdisc_bstats_update(sch, skb); qdisc_unthrottled(sch); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } @@ -955,6 +957,7 @@ static unsigned int htb_drop(struct Qdisc *sch) unsigned int len; if (cl->un.leaf.q->ops->drop && (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + sch->qstats.backlog -= len; sch->q.qlen--; if (!cl->un.leaf.q->q.qlen) htb_deactivate(q, cl); @@ -984,12 +987,12 @@ static void htb_reset(struct Qdisc *sch) } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; - } } qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; + sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); for (i = 0; i < TC_HTB_NUMPRIO; i++) @@ -1163,14 +1166,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, cl->common.classid)) == NULL) return -ENOBUFS; - sch_tree_lock(sch); - *old = cl->un.leaf.q; - cl->un.leaf.q = new; - if (*old != NULL) { - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - } - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->un.leaf.q); return 0; } @@ -1272,7 +1268,6 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - unsigned int qlen; struct Qdisc *new_q = NULL; int last_child = 0; @@ -1292,9 +1287,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); if (!cl->level) { - qlen = cl->un.leaf.q->q.qlen; + unsigned int qlen = cl->un.leaf.q->q.qlen; + unsigned int backlog = cl->un.leaf.q->qstats.backlog; + qdisc_reset(cl->un.leaf.q); - qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen); + qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog); } /* delete from hash and active; remainder in destroy_class */ @@ -1428,10 +1425,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_lock(sch); if (parent && !parent->level) { unsigned int qlen = parent->un.leaf.q->q.qlen; + unsigned int backlog = parent->un.leaf.q->qstats.backlog; /* turn parent into inner node */ qdisc_reset(parent->un.leaf.q); - qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen); + qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog); qdisc_destroy(parent->un.leaf.q); if (parent->prio_activity) htb_deactivate(q, parent); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index ad70ecf57ce7..f9947d1f4952 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -28,6 +28,7 @@ static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO}; unsigned int ntx; if (priv->qdiscs) { @@ -39,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) - dev->netdev_ops->ndo_setup_tc(dev, 0); + dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); else netdev_set_num_tc(dev, 0); } @@ -140,8 +141,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { + struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO, + .tc = qopt->num_tc}; + priv->hw_owned = 1; - err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); if (err) goto err; } else { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 4e904ca0af9d..bcdd54bb101c 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -218,7 +218,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; - qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_tree_reduce_backlog(child, child->q.qlen, + child->qstats.backlog); qdisc_destroy(child); } } @@ -238,8 +239,9 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) q->queues[i] = child; if (old != &noop_qdisc) { - qdisc_tree_decrease_qlen(old, - old->q.qlen); + qdisc_tree_reduce_backlog(old, + old->q.qlen, + old->qstats.backlog); qdisc_destroy(old); } sch_tree_unlock(sch); @@ -303,13 +305,7 @@ static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->queues[band]; - q->queues[band] = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5abd1d9de989..9640bb39a5d2 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -598,7 +598,8 @@ deliver: if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { qdisc_qstats_drop(sch); - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, + qdisc_pkt_len(skb)); } } goto tfifo_dequeue; @@ -1037,15 +1038,7 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, { struct netem_sched_data *q = qdisc_priv(sch); - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - if (*old) { - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - } - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b783a446d884..71ae3b9629f9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -183,7 +183,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen; + unsigned int qlen, dropped = 0; int err; if (!opt) @@ -232,10 +232,11 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); + dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index ba6487f2741f..fee1b15506b2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -191,7 +191,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; if (child != &noop_qdisc) { - qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); qdisc_destroy(child); } } @@ -210,8 +210,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) q->queues[i] = child; if (old != &noop_qdisc) { - qdisc_tree_decrease_qlen(old, - old->q.qlen); + qdisc_tree_reduce_backlog(old, + old->q.qlen, + old->qstats.backlog); qdisc_destroy(old); } sch_tree_unlock(sch); @@ -268,13 +269,7 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->queues[band]; - q->queues[band] = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3dc3a6e56052..8d2d8d953432 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -220,9 +220,10 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) static void qfq_purge_queue(struct qfq_class *cl) { unsigned int len = cl->qdisc->q.qlen; + unsigned int backlog = cl->qdisc->qstats.backlog; qdisc_reset(cl->qdisc); - qdisc_tree_decrease_qlen(cl->qdisc, len); + qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); } static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { @@ -617,11 +618,7 @@ static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, new = &noop_qdisc; } - sch_tree_lock(sch); - qfq_purge_queue(cl); - *old = cl->qdisc; - cl->qdisc = new; - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6c0534cc7758..8c0508c0e287 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -210,7 +210,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) q->flags = ctl->flags; q->limit = ctl->limit; if (child) { - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; } @@ -313,12 +314,7 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 5bbb6332ec57..c69611640fa5 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -510,7 +510,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; @@ -606,12 +607,7 @@ static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 3abab534eb5c..498f0a2cb47f 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -346,7 +346,7 @@ static int sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); - unsigned int hash; + unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; int uninitialized_var(ret); @@ -461,7 +461,7 @@ enqueue: return NET_XMIT_SUCCESS; qlen = slot->qlen; - sfq_drop(sch); + dropped = sfq_drop(sch); /* Return Congestion Notification only if we dropped a packet * from this flow. */ @@ -469,7 +469,7 @@ enqueue: return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_decrease_qlen(sch, 1); + qdisc_tree_reduce_backlog(sch, 1, dropped); return NET_XMIT_SUCCESS; } @@ -537,6 +537,7 @@ static void sfq_rehash(struct Qdisc *sch) struct sfq_slot *slot; struct sk_buff_head list; int dropped = 0; + unsigned int drop_len = 0; __skb_queue_head_init(&list); @@ -565,6 +566,7 @@ static void sfq_rehash(struct Qdisc *sch) if (x >= SFQ_MAX_FLOWS) { drop: qdisc_qstats_backlog_dec(sch, skb); + drop_len += qdisc_pkt_len(skb); kfree_skb(skb); dropped++; continue; @@ -594,7 +596,7 @@ drop: } } sch->q.qlen -= dropped; - qdisc_tree_decrease_qlen(sch, dropped); + qdisc_tree_reduce_backlog(sch, dropped, drop_len); } static void sfq_perturbation(unsigned long arg) @@ -618,7 +620,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); struct tc_sfq_qopt_v1 *ctl_v1 = NULL; - unsigned int qlen; + unsigned int qlen, dropped = 0; struct red_parms *p = NULL; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) @@ -667,8 +669,8 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) qlen = sch->q.qlen; while (sch->q.qlen > q->limit) - sfq_drop(sch); - qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + dropped += sfq_drop(sch); + qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); del_timer(&q->perturb_timer); if (q->perturb_period) { diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index a4afde14e865..c2fbde742f37 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -160,6 +160,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); + unsigned int len = 0, prev_len = qdisc_pkt_len(skb); int ret, nb; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); @@ -172,6 +173,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) nskb = segs->next; segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; + len += segs->len; ret = qdisc_enqueue(segs, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) @@ -183,7 +185,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) } sch->q.qlen += nb; if (nb > 1) - qdisc_tree_decrease_qlen(sch, 1 - nb); + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); consume_skb(skb); return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } @@ -399,7 +401,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); if (child) { - qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, + q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; } @@ -502,13 +505,7 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (new == NULL) new = &noop_qdisc; - sch_tree_lock(sch); - *old = q->qdisc; - q->qdisc = new; - qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); - qdisc_reset(*old); - sch_tree_unlock(sch); - + *old = qdisc_replace(sch, new, &q->qdisc); return 0; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a3380917f197..3aa43073e0b9 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -70,19 +70,6 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } -void sctp_datamsg_free(struct sctp_datamsg *msg) -{ - struct sctp_chunk *chunk; - - /* This doesn't have to be a _safe vairant because - * sctp_chunk_free() only drops the refs. - */ - list_for_each_entry(chunk, &msg->chunks, frag_list) - sctp_chunk_free(chunk); - - sctp_datamsg_put(msg); -} - /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 52838eaa1582..2522a6175291 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -333,7 +333,7 @@ struct sctp_association *sctp_endpoint_lookup_assoc( if (!ep->base.bind_addr.port) goto out; t = sctp_epaddr_lookup_transport(ep, paddr); - if (!t || t->asoc->temp) + if (!t) goto out; *transport = t; diff --git a/net/sctp/input.c b/net/sctp/input.c index d9a6e66c5c8a..21a2d6b7abaf 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -784,6 +784,7 @@ hit: /* rhashtable for transport */ struct sctp_hash_cmp_arg { + const struct sctp_endpoint *ep; const union sctp_addr *laddr; const union sctp_addr *paddr; const struct net *net; @@ -797,15 +798,20 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, struct sctp_association *asoc = t->asoc; const struct net *net = x->net; - if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) - return 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return 1; if (!net_eq(sock_net(asoc->base.sk), net)) return 1; - if (!sctp_bind_addr_match(&asoc->base.bind_addr, - x->laddr, sctp_sk(asoc->base.sk))) - return 1; + if (x->ep) { + if (x->ep != asoc->ep) + return 1; + } else { + if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) + return 1; + if (!sctp_bind_addr_match(&asoc->base.bind_addr, + x->laddr, sctp_sk(asoc->base.sk))) + return 1; + } return 0; } @@ -832,9 +838,11 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed) const struct sctp_hash_cmp_arg *x = data; const union sctp_addr *paddr = x->paddr; const struct net *net = x->net; - u16 lport = x->laddr->v4.sin_port; + u16 lport; u32 addr; + lport = x->ep ? htons(x->ep->base.bind_addr.port) : + x->laddr->v4.sin_port; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else @@ -864,12 +872,12 @@ void sctp_transport_hashtable_destroy(void) void sctp_hash_transport(struct sctp_transport *t) { - struct sctp_sockaddr_entry *addr; struct sctp_hash_cmp_arg arg; - addr = list_entry(t->asoc->base.bind_addr.address_list.next, - struct sctp_sockaddr_entry, list); - arg.laddr = &addr->a; + if (t->asoc->temp) + return; + + arg.ep = t->asoc->ep; arg.paddr = &t->ipaddr; arg.net = sock_net(t->asoc->base.sk); @@ -881,6 +889,9 @@ reinsert: void sctp_unhash_transport(struct sctp_transport *t) { + if (t->asoc->temp) + return; + rhashtable_remove_fast(&sctp_transport_hashtable, &t->node, sctp_hash_params); } @@ -891,6 +902,7 @@ struct sctp_transport *sctp_addrs_lookup_transport( const union sctp_addr *paddr) { struct sctp_hash_cmp_arg arg = { + .ep = NULL, .laddr = laddr, .paddr = paddr, .net = net, @@ -904,13 +916,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { - struct sctp_sockaddr_entry *addr; struct net *net = sock_net(ep->base.sk); + struct sctp_hash_cmp_arg arg = { + .ep = ep, + .paddr = paddr, + .net = net, + }; - addr = list_entry(ep->base.bind_addr.address_list.next, - struct sctp_sockaddr_entry, list); - - return sctp_addrs_lookup_transport(net, &addr->a, paddr); + return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg, + sctp_hash_params); } /* Look up an association. */ @@ -921,15 +935,20 @@ static struct sctp_association *__sctp_lookup_association( struct sctp_transport **pt) { struct sctp_transport *t; + struct sctp_association *asoc = NULL; t = sctp_addrs_lookup_transport(net, local, peer); - if (!t || t->dead || t->asoc->temp) - return NULL; + if (!t || !sctp_transport_hold(t)) + goto out; - sctp_association_hold(t->asoc); + asoc = t->asoc; + sctp_association_hold(asoc); *pt = t; - return t->asoc; + sctp_transport_put(t); + +out: + return asoc; } /* Look up an association. protected by RCU read lock */ diff --git a/net/sctp/proc.c b/net/sctp/proc.c index dfa7eeccb537..cfc3c7101a38 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -161,12 +161,9 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa struct sctp_af *af; primary = &assoc->peer.primary_addr; - rcu_read_lock(); list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; - if (transport->dead) - continue; af = sctp_get_af_specific(addr->sa.sa_family); if (af->cmp_addr(addr, primary)) { @@ -174,7 +171,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa } af->seq_dump_addr(seq, addr); } - rcu_read_unlock(); } static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) @@ -310,7 +306,7 @@ static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq) static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq, loff_t pos) { - void *obj; + void *obj = SEQ_START_TOKEN; while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj)) pos--; @@ -347,7 +343,7 @@ static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) if (err) return ERR_PTR(err); - return *pos ? sctp_transport_get_idx(seq, *pos) : SEQ_START_TOKEN; + return sctp_transport_get_idx(seq, *pos); } static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) @@ -380,6 +376,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) } transport = (struct sctp_transport *)v; + if (!sctp_transport_hold(transport)) + return 0; assoc = transport->asoc; epb = &assoc->base; sk = epb->sk; @@ -412,6 +410,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) sk->sk_rcvbuf); seq_printf(seq, "\n"); + sctp_transport_put(transport); + return 0; } @@ -462,7 +462,7 @@ static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos) if (err) return ERR_PTR(err); - return *pos ? sctp_transport_get_idx(seq, *pos) : SEQ_START_TOKEN; + return sctp_transport_get_idx(seq, *pos); } static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -489,12 +489,12 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) } tsp = (struct sctp_transport *)v; + if (!sctp_transport_hold(tsp)) + return 0; assoc = tsp->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { - if (tsp->dead) - continue; /* * The remote address (ADDR) */ @@ -544,6 +544,8 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } + sctp_transport_put(tsp); + return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ab0d538a74ed..1099e99a53c4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -60,6 +60,8 @@ #include <net/inet_common.h> #include <net/inet_ecn.h> +#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) + /* Global data structures. */ struct sctp_globals sctp_globals __read_mostly; @@ -1355,6 +1357,8 @@ static __init int sctp_init(void) unsigned long limit; int max_share; int order; + int num_entries; + int max_entry_order; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); @@ -1407,14 +1411,24 @@ static __init int sctp_init(void) /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. + * Though not identical. Start by getting a goal size */ if (totalram_pages >= (128 * 1024)) goal = totalram_pages >> (22 - PAGE_SHIFT); else goal = totalram_pages >> (24 - PAGE_SHIFT); - for (order = 0; (1UL << order) < goal; order++) - ; + /* Then compute the page order for said goal */ + order = get_order(goal); + + /* Now compute the required page order for the maximum sized table we + * want to create + */ + max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * + sizeof(struct sctp_bind_hashbucket)); + + /* Limit the page order by that maximum hash table size */ + order = min(order, max_entry_order); /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; @@ -1430,20 +1444,35 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } - /* Allocate and initialize the SCTP port hash table. */ + /* Allocate and initialize the SCTP port hash table. + * Note that order is initalized to start at the max sized + * table we want to support. If we can't get that many pages + * reduce the order and try again + */ do { - sctp_port_hashsize = (1UL << order) * PAGE_SIZE / - sizeof(struct sctp_bind_hashbucket); - if ((sctp_port_hashsize > (64 * 1024)) && order > 0) - continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); + if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } + + /* Now compute the number of entries that will fit in the + * port hash space we allocated + */ + num_entries = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_bind_hashbucket); + + /* And finish by rounding it down to the nearest power of two + * this wastes some memory of course, but its needed because + * the hash function operates based on the assumption that + * that the number of entries is a power of two + */ + sctp_port_hashsize = rounddown_pow_of_two(num_entries); + for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); @@ -1452,7 +1481,8 @@ static __init int sctp_init(void) if (sctp_transport_hashtable_init()) goto err_thash_alloc; - pr_info("Hash tables configured (bind %d)\n", sctp_port_hashsize); + pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, + num_entries); sctp_sysctl_register(); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 2e21384697c2..b5327bb77458 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -259,12 +259,6 @@ void sctp_generate_t3_rtx_event(unsigned long peer) goto out_unlock; } - /* Is this transport really dead and just waiting around for - * the timer to let go of the reference? - */ - if (transport->dead) - goto out_unlock; - /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX), @@ -380,12 +374,6 @@ void sctp_generate_heartbeat_event(unsigned long data) goto out_unlock; } - /* Is this structure just waiting around for us to actually - * get destroyed? - */ - if (transport->dead) - goto out_unlock; - error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), asoc->state, asoc->ep, asoc, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9bb80ec4c08f..b89501e5c1a1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5538,6 +5538,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; + int i; if (!ep->auth_enable) return -EACCES; @@ -5555,8 +5556,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, return -EFAULT; if (put_user(num_idents, &p->shmac_num_idents)) return -EFAULT; - if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len)) - return -EFAULT; + for (i = 0; i < num_idents; i++) { + __u16 hmacid = ntohs(hmacs->hmac_ids[i]); + + if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16))) + return -EFAULT; + } return 0; } @@ -6101,9 +6106,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, return retval; } -static void sctp_hash(struct sock *sk) +static int sctp_hash(struct sock *sk) { /* STUB */ + return 0; } static void sctp_unhash(struct sock *sk) @@ -6636,6 +6642,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_SACK_IMMEDIATELY | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -6659,6 +6666,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_SACK_IMMEDIATELY | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index aab9e3f29755..a431c14044a4 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -132,8 +132,6 @@ fail: */ void sctp_transport_free(struct sctp_transport *transport) { - transport->dead = 1; - /* Try to delete the heartbeat timer. */ if (del_timer(&transport->hb_timer)) sctp_transport_put(transport); @@ -169,7 +167,7 @@ static void sctp_transport_destroy_rcu(struct rcu_head *head) */ static void sctp_transport_destroy(struct sctp_transport *transport) { - if (unlikely(!transport->dead)) { + if (unlikely(atomic_read(&transport->refcnt))) { WARN(1, "Attempt to destroy undead transport %p!\n", transport); return; } @@ -296,9 +294,9 @@ void sctp_transport_route(struct sctp_transport *transport, } /* Hold a reference to a transport. */ -void sctp_transport_hold(struct sctp_transport *transport) +int sctp_transport_hold(struct sctp_transport *transport) { - atomic_inc(&transport->refcnt); + return atomic_add_unless(&transport->refcnt, 1, 0); } /* Release a reference to a transport and clean up diff --git a/net/socket.c b/net/socket.c index 91c2de6f5020..c044d1e8508c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -294,7 +294,7 @@ static int init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 59eeed43eda2..f0c6a8c78a56 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -326,6 +326,9 @@ int gssp_accept_sec_context_upcall(struct net *net, if (data->found_creds && client_name.data != NULL) { char *c; + data->creds.cr_raw_principal = kstrndup(client_name.data, + client_name.len, GFP_KERNEL); + data->creds.cr_principal = kstrndup(client_name.data, client_name.len, GFP_KERNEL); if (data->creds.cr_principal) { diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5e4f815c2b34..2b32fd602669 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (count == 0) return 0; - mutex_lock(&inode->i_mutex); /* protect against multiple concurrent + inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: spin_lock(&queue_lock); @@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } @@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (err == -EAGAIN) goto again; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err ? err : count; } @@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf, if (!cd->cache_parse) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = cache_downcall(mapping, buf, count, cd); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return ret; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 23608eb0ded2..b7f21044f4d8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1217,6 +1217,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen) return -EINVAL; memcpy(buf, &rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); + break; default: dprintk("RPC: %s: address family not supported\n", __func__); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d34558..31789ef3e614 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode) int need_release; LIST_HEAD(free_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock(&pipe->lock); need_release = pipe->nreaders != 0 || pipe->nwriters != 0; pipe->nreaders = 0; @@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode) cancel_delayed_work_sync(&pipe->queue_timeout); rpc_inode_setowner(inode, NULL); RPC_I(inode)->pipe = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static struct inode * @@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) int first_open; int res = -ENXIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) pipe->nwriters++; res = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) struct rpc_pipe_msg *msg; int last_close; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (last_close && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { res = -EPIPE; @@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) pipe->ops->destroy_msg(msg); } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct inode *inode = file_inode(filp); int res; - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = -EPIPE; if (RPC_I(inode)->pipe != NULL) res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) poll_wait(filp, &rpci->waitq, wait); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (rpci->pipe == NULL) mask |= POLLERR | POLLHUP; else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) mask |= POLLIN | POLLRDNORM; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return mask; } @@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -EPIPE; } spin_lock(&pipe->lock); @@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) len += msg->len - msg->copied; } spin_unlock(&pipe->lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(len, (int __user *)arg); default: return -EINVAL; @@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent, { struct inode *dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dir, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); } static int rpc_populate(struct dentry *parent, @@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent, struct dentry *dentry; int i, err; - mutex_lock(&dir->i_mutex); + inode_lock(dir); for (i = start; i < eof; i++) { dentry = __rpc_lookup_create_exclusive(parent, files[i].name); err = PTR_ERR(dentry); @@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent, if (err != 0) goto out_bad; } - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return 0; out_bad: __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; @@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, struct inode *dir = d_inode(parent); int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, goto err_rmdir; } out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; err_rmdir: __rpc_rmdir(dir, dentry); @@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~S_IWUGO; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (err) goto out_err; out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out_err: dentry = ERR_PTR(err); @@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index a6cbb2104667..7422f28818b2 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -10,11 +10,13 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <net/sock.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprt.h> #include <linux/module.h> +#include <linux/netdevice.h> #include <trace/events/sunrpc.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -938,6 +940,49 @@ static void svc_age_temp_xprts(unsigned long closure) mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } +/* Close temporary transports whose xpt_local matches server_addr immediately + * instead of waiting for them to be picked up by the timer. + * + * This is meant to be called from a notifier_block that runs when an ip + * address is deleted. + */ +void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) +{ + struct svc_xprt *xprt; + struct svc_sock *svsk; + struct socket *sock; + struct list_head *le, *next; + LIST_HEAD(to_be_closed); + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + spin_lock_bh(&serv->sv_lock); + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + if (rpc_cmp_addr(server_addr, (struct sockaddr *) + &xprt->xpt_local)) { + dprintk("svc_age_temp_xprts_now: found %p\n", xprt); + list_move(le, &to_be_closed); + } + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_closed)) { + le = to_be_closed.next; + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); + svc_close_xprt(xprt); + } +} +EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); + static void call_xpt_users(struct svc_xprt *xprt) { struct svc_xpt_user *u; diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 79c0f3459b5c..69841db1f533 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -55,6 +55,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) spin_unlock(&authtab_lock); rqstp->rq_auth_slack = 0; + init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; return aops->accept(rqstp, authp); @@ -63,6 +64,7 @@ EXPORT_SYMBOL_GPL(svc_authenticate); int svc_set_client(struct svc_rqst *rqstp) { + rqstp->rq_client = NULL; return rqstp->rq_authop->set_client(rqstp); } EXPORT_SYMBOL_GPL(svc_set_client); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 621ca7b4a155..dfacdc95b3f5 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -728,10 +728,6 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) struct kvec *resv = &rqstp->rq_res.head[0]; struct svc_cred *cred = &rqstp->rq_cred; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if (argv->iov_len < 3*4) return SVC_GARBAGE; @@ -794,10 +790,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) u32 slen, i; int len = argv->iov_len; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if ((len -= 3*4) < 0) return SVC_GARBAGE; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2e98f4a243e5..37edea6fa92d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } +EXPORT_SYMBOL_GPL(xprt_put); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 33f99d3004f2..dc9f3b513a05 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o physical_ops.o \ - svc_rdma.o svc_rdma_transport.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcb44f69e53..cc1251d07297 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -15,7 +15,7 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -#define RPCRDMA_BACKCHANNEL_DEBUG +#undef RPCRDMA_BACKCHANNEL_DEBUG static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) @@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, size_t size; req = rpcrdma_create_req(r_xprt); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->rl_backchannel = true; size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); @@ -84,9 +84,7 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - unsigned long flags; int rc = 0; while (count--) { @@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, break; } - spin_lock_irqsave(&buffers->rb_lock, flags); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + rpcrdma_recv_buffer_put(rep); } return rc; @@ -140,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) __func__); goto out_free; } + dprintk("RPC: %s: new rqst %p\n", __func__, rqst); rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); @@ -220,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) rpclen = rqst->rq_svec[0].iov_len; +#ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); pr_info("RPC: %s: RPC/RDMA: %*ph\n", __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); pr_info("RPC: %s: RPC: %*ph\n", __func__, (int)rpclen, rqst->rq_svec[0].iov_base); +#endif req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; @@ -269,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; + dprintk("RPC: %s: freeing rqst %p (req %p)\n", + __func__, rqst, rpcr_to_rdmar(rqst)); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -333,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: using rqst %p\n", __func__, rqst); -#endif + dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -355,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * direction reply. */ req = rpcr_to_rdmar(rqst); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: attaching rep %p to req %p\n", + dprintk("RPC: %s: attaching rep %p to req %p\n", __func__, rep, req); -#endif req->rl_reply = rep; /* Defeat the retransmit detection logic in send_request */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f1e8dafbd507..c14f3a4bff68 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -179,6 +179,69 @@ out_maperr: return rc; } +static void +__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + int nsegs = seg->mr_nsegs; + + seg->rl_mw = NULL; + + while (nsegs--) + rpcrdma_unmap_one(device, seg++); + + rpcrdma_put_mw(r_xprt, mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_mw *mw; + LIST_HEAD(unmap_list); + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * ib_unmap_fmr() is slow, so use a single call instead + * of one call per mapped MR. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + + list_add(&mw->r.fmr.fmr->list, &unmap_list); + + i += seg->mr_nsegs; + } + rc = ib_unmap_fmr(&unmap_list); + if (rc) + pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __fmr_dma_unmap(r_xprt, seg); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Use the ib_unmap_fmr() verb to prevent further remote * access via RDMA READ or RDMA WRITE. */ @@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap = fmr_op_unmap, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 88cf9e7269c2..e16567389e28 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -190,12 +190,11 @@ static int frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; int depth, delta; ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); + ia->ri_device->attrs.max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", __func__, ia->ri_max_frmr_depth); @@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; + if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) { + cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * @@ -245,12 +244,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } -/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs + * to be reset. + * + * WARNING: Only wr_id and status are reliable at this point + */ static void -frwr_sendcompletion(struct ib_wc *wc) +__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r) { - struct rpcrdma_mw *r; - if (likely(wc->status == IB_WC_SUCCESS)) return; @@ -261,9 +262,23 @@ frwr_sendcompletion(struct ib_wc *wc) else pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", __func__, r, ib_wc_status_msg(wc->status), wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; } +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + struct rpcrdma_frmr *f = &r->r.frmr; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + __frwr_sendcompletion_flush(wc, r); + + if (f->fr_waiter) + complete(&f->fr_linv_done); +} + static int frwr_op_init(struct rpcrdma_xprt *r_xprt) { @@ -319,7 +334,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; - struct ib_reg_wr reg_wr; + struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, n, dma_nents; u8 key; @@ -335,7 +350,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->r.frmr; frmr->fr_state = FRMR_IS_VALID; + frmr->fr_waiter = false; mr = frmr->fr_mr; + reg_wr = &frmr->fr_regwr; if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; @@ -381,19 +398,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = (uintptr_t)mw; - reg_wr.wr.num_sge = 0; - reg_wr.wr.send_flags = 0; - reg_wr.mr = mr; - reg_wr.key = mr->rkey; - reg_wr.access = writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; + reg_wr->wr.next = NULL; + reg_wr->wr.opcode = IB_WR_REG_MR; + reg_wr->wr.wr_id = (uintptr_t)mw; + reg_wr->wr.num_sge = 0; + reg_wr->wr.send_flags = 0; + reg_wr->mr = mr; + reg_wr->key = mr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, ®_wr.wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -413,6 +430,116 @@ out_senderr: return rc; } +static struct ib_send_wr * +__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + struct ib_send_wr *invalidate_wr; + + f->fr_waiter = false; + f->fr_state = FRMR_IS_INVALID; + invalidate_wr = &f->fr_invwr; + + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (unsigned long)(void *)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey; + + return invalidate_wr; +} + +static void +__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int rc) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + + seg->rl_mw = NULL; + + ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir); + + if (!rc) + rpcrdma_put_mw(r_xprt, mw); + else + __frwr_queue_recovery(mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_frmr *f; + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * Chain the LOCAL_INV Work Requests and post them with + * a single ib_post_send() call. + */ + invalidate_wrs = pos = prev = NULL; + seg = NULL; + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + pos = __frwr_prepare_linv_wr(seg); + + if (!invalidate_wrs) + invalidate_wrs = pos; + else + prev->next = pos; + prev = pos; + + i += seg->mr_nsegs; + } + f = &seg->rl_mw->r.frmr; + + /* Strong send queue ordering guarantees that when the + * last WR in the chain completes, all WRs in the chain + * are complete. + */ + f->fr_invwr.send_flags = IB_SEND_SIGNALED; + f->fr_waiter = true; + init_completion(&f->fr_linv_done); + INIT_CQCOUNT(&r_xprt->rx_ep); + + /* Transport disconnect drains the receive CQ before it + * replaces the QP. The RPC reply handler won't call us + * unless ri_id->qp is a valid pointer. + */ + rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); + if (rc) + pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + + wait_for_completion(&f->fr_linv_done); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __frwr_dma_unmap(r_xprt, seg, rc); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Post a LOCAL_INV Work Request to prevent further remote access * via RDMA READ or RDMA WRITE. */ @@ -423,23 +550,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw = seg1->rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_send_wr invalidate_wr, *bad_wr; + struct ib_send_wr *invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; dprintk("RPC: %s: FRMR %p\n", __func__, mw); seg1->rl_mw = NULL; frmr->fr_state = FRMR_IS_INVALID; + invalidate_wr = &mw->r.frmr.fr_invwr; - memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (uintptr_t)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); if (rc) goto out_err; @@ -471,6 +599,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap = frwr_op_unmap, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 617b76f22154..dbb302ecf590 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) return 1; } +/* DMA unmap all memory regions that were mapped for "req". + */ +static void +physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + unsigned int i; + + for (i = 0; req->rl_nchunks; --req->rl_nchunks) + rpcrdma_unmap_one(device, &req->rl_segments[i++]); +} + static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, + .ro_unmap_sync = physical_op_unmap_sync, .ro_unmap = physical_op_unmap, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c10d9699441c..0f28f2d743ed 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (req->rl_reply) goto out_duplicate; + /* Sanity checking has passed. We are now committed + * to complete this transaction. + */ + list_del_init(&rqst->rq_list); + spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", __func__, rep, req, rqst, @@ -888,12 +893,23 @@ badheader: break; } + /* Invalidate and flush the data payloads before waking the + * waiting application. This guarantees the memory region is + * properly fenced from the server before the application + * accesses the data. It also ensures proper send flow + * control: waking the next RPC waits until this RPC has + * relinquished all its Send Queue entries. + */ + if (req->rl_nchunks) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); + credits = be32_to_cpu(headerp->rm_credit); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_max_requests) credits = r_xprt->rx_buf.rb_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 1b7051bdbdc8..c846ca9f1eba 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD; static unsigned int min_ord = 1; static unsigned int max_ord = 4096; unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; @@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -/* Temporary NFS request map and context caches */ -struct kmem_cache *svc_rdma_map_cachep; -struct kmem_cache *svc_rdma_ctxt_cachep; - struct workqueue_struct *svc_rdma_wq; /* @@ -243,17 +240,16 @@ void svc_rdma_cleanup(void) svc_unreg_xprt_class(&svc_rdma_bc_class); #endif svc_unreg_xprt_class(&svc_rdma_class); - kmem_cache_destroy(svc_rdma_map_cachep); - kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) { dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %d\n", svcrdma_max_requests); - dprintk("\tsq_depth : %d\n", + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tsq_depth : %u\n", svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); @@ -264,39 +260,10 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); - /* Create the temporary map cache */ - svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", - sizeof(struct svc_rdma_req_map), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_map_cachep) { - printk(KERN_INFO "Could not allocate map cache.\n"); - goto err0; - } - - /* Create the temporary context cache */ - svc_rdma_ctxt_cachep = - kmem_cache_create("svc_rdma_ctxt_cache", - sizeof(struct svc_rdma_op_ctxt), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_ctxt_cachep) { - printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); - goto err1; - } - /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); #if defined(CONFIG_SUNRPC_BACKCHANNEL) svc_reg_xprt_class(&svc_rdma_bc_class); #endif return 0; - err1: - kmem_cache_destroy(svc_rdma_map_cachep); - err0: - unregister_sysctl_table(svcrdma_table_header); - destroy_workqueue(svc_rdma_wq); - return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 000000000000..65a7c232a345 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = rmsgp->rm_xid; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock_bh(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpu(rmsgp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock_bh(&xprt->transport_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. + * + * This is similar to svc_rdma_reply, but takes an rpc_rqst + * instead, does not support chunks, and avoids blocking memory + * allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. + */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst) +{ + struct xdr_buf *sndbuf = &rqst->rq_snd_buf; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + struct ib_send_wr send_wr; + int ret; + + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, sndbuf, vec); + if (ret) + goto out_err; + + /* Post a recv buffer to handle the reply for this request. */ + ret = svc_rdma_post_recv(rdma, GFP_NOIO); + if (ret) { + pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + ret = -ENOTCONN; + goto out_err; + } + + ctxt = svc_rdma_get_context(rdma); + ctxt->pages[0] = virt_to_page(rqst->rq_buffer); + ctxt->count = 1; + + ctxt->wr_op = IB_WR_SEND; + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; + ctxt->sge[0].length = sndbuf->len; + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, + sndbuf->len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { + ret = -EIO; + goto out_unmap; + } + atomic_inc(&rdma->sc_dma_used); + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) { + ret = -EIO; + goto out_unmap; + } + +out_err: + svc_rdma_put_req_map(rdma, vec); + dprintk("svcrdma: %s returns %d\n", __func__, ret); + return ret; + +out_unmap: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + goto out_err; +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + * + * Returns NULL if there was a temporary allocation failure. + */ +static void * +xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + struct page *page; + + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + + /* Prevent an infinite loop: try to make this case work */ + if (size > PAGE_SIZE) + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return NULL; + + return page_address(page); +} + +static void +xprt_rdma_bc_free(void *buffer) +{ + /* No-op: ctxt and page have already been freed. */ +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + int rc; + + /* Space in the send buffer for an RPC/RDMA header is reserved + * via xprt->tsh_size. + */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst); + if (rc) + goto drop_connection; + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. + */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. + */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ff4f01e527ec..c8b8a8b4181e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; head->arg.page_len += len; + head->arg.len += len; if (!pg_off) head->count++; @@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, goto err; atomic_inc(&xprt->sc_dma_used); - /* The lkey here is either a local dma lkey or a dma_mr lkey */ - ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; ctxt->count++; @@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp, return ret; } +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +{ + __be32 *p = (__be32 *)rmsgp; + + if (!xprt->xpt_bc_xprt) + return false; + + if (rmsgp->rm_type != rdma_msg) + return false; + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != rmsgp->rm_xid) + return false; + /* call direction */ + if (p[8] == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + /* * Set up the rqstp thread context to point to the RQ buffer. If * necessary, pull additional data from the client with an RDMA_READ @@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto close_out; } + if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + &rqstp->rq_arg); + svc_rdma_put_context(ctxt, 0); + if (ret) + goto repost; + return ret; + } + /* Read read-list data. */ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { @@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); defer: return 0; + +repost: + ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL); + if (ret) { + pr_err("svcrdma: could not post a receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma_xprt); + set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags); + ret = -ENOTCONN; + } + return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 969a1ab75fc3..df57f3ce6cd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -50,9 +50,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_no; u32 sge_bytes; @@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, if (xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: map_xdr: XDR buffer length error\n"); + pr_err("svcrdma: %s: XDR buffer length error\n", __func__); return -EIO; } @@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + dprintk("svcrdma: %s: sge_no %d page_no %d " "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, + __func__, sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); vec->count = sge_no; @@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge[sge_no].addr)) goto err; atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; + sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; sge_no++; @@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int ret; /* Post a recv buffer to handle another request. */ - ret = svc_rdma_post_recv(rdma); + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) { printk(KERN_INFO "svcrdma: could not post a receive buffer, err=%d." @@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, @@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].addr)) goto err; atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } if (byte_count != 0) { @@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - vec = svc_rdma_get_req_map(); - ret = map_xdr(rdma, &rqstp->rq_res, vec); + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec); if (ret) goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ - res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + ret = -ENOMEM; + res_page = alloc_page(GFP_KERNEL); + if (!res_page) + goto err0; rdma_resp = page_address(res_page); reply_ary = svc_rdma_get_reply_array(rdma_argp); if (reply_ary) @@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; err1: put_page(res_page); err0: - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); svc_rdma_put_context(ctxt, 0); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b348b4adef29..5763825d09bf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, + gfp_t flags) { struct svc_rdma_op_ctxt *ctxt; - ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, - GFP_KERNEL | __GFP_NOFAIL); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); + ctxt = kmalloc(sizeof(*ctxt), flags); + if (ctxt) { + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->free); + INIT_LIST_HEAD(&ctxt->dto_q); + } + return ctxt; +} + +static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* Each RPC/RDMA credit can consume a number of send + * and receive WQEs. One ctxt is allocated for each. + */ + i = xprt->sc_sq_depth + xprt->sc_rq_depth; + + while (i--) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = alloc_ctxt(xprt, GFP_KERNEL); + if (!ctxt) { + dprintk("svcrdma: No memory for RDMA ctxt\n"); + return false; + } + list_add(&ctxt->free, &xprt->sc_ctxts); + } + return true; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used++; + if (list_empty(&xprt->sc_ctxts)) + goto out_empty; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del_init(&ctxt->free); + spin_unlock_bh(&xprt->sc_ctxt_lock); + +out: ctxt->count = 0; ctxt->frmr = NULL; - atomic_inc(&xprt->sc_ctxt_used); return ctxt; + +out_empty: + /* Either pre-allocation missed the mark, or send + * queue accounting is broken. + */ + spin_unlock_bh(&xprt->sc_ctxt_lock); + + ctxt = alloc_ctxt(xprt, GFP_NOIO); + if (ctxt) + goto out; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + spin_unlock_bh(&xprt->sc_ctxt_lock); + WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); + return NULL; } void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) @@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches - * the sc_dma_lkey, otherwise, ignore it since it is + * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { atomic_dec(&xprt->sc_dma_used); ib_dma_unmap_page(xprt->sc_cm_id->device, ctxt->sge[i].addr, @@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) { - struct svcxprt_rdma *xprt; + struct svcxprt_rdma *xprt = ctxt->xprt; int i; - xprt = ctxt->xprt; if (free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); - atomic_dec(&xprt->sc_ctxt_used); + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + list_add(&ctxt->free, &xprt->sc_ctxts); + spin_unlock_bh(&xprt->sc_ctxt_lock); } -/* - * Temporary NFS req mappings are shared across all transport - * instances. These are short lived and should be bounded by the number - * of concurrent server threads * depth of the SQ. - */ -struct svc_rdma_req_map *svc_rdma_get_req_map(void) +static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) +{ + while (!list_empty(&xprt->sc_ctxts)) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del(&ctxt->free); + kfree(ctxt); + } +} + +static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) { struct svc_rdma_req_map *map; - map = kmem_cache_alloc(svc_rdma_map_cachep, - GFP_KERNEL | __GFP_NOFAIL); + + map = kmalloc(sizeof(*map), flags); + if (map) + INIT_LIST_HEAD(&map->free); + return map; +} + +static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* One for each receive buffer on this connection. */ + i = xprt->sc_max_requests; + + while (i--) { + struct svc_rdma_req_map *map; + + map = alloc_req_map(GFP_KERNEL); + if (!map) { + dprintk("svcrdma: No memory for request map\n"); + return false; + } + list_add(&map->free, &xprt->sc_maps); + } + return true; +} + +struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_req_map *map = NULL; + + spin_lock(&xprt->sc_map_lock); + if (list_empty(&xprt->sc_maps)) + goto out_empty; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del_init(&map->free); + spin_unlock(&xprt->sc_map_lock); + +out: map->count = 0; return map; + +out_empty: + spin_unlock(&xprt->sc_map_lock); + + /* Pre-allocation amount was incorrect */ + map = alloc_req_map(GFP_NOIO); + if (map) + goto out; + + WARN_ONCE(1, "svcrdma: empty request map list?\n"); + return NULL; +} + +void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, + struct svc_rdma_req_map *map) +{ + spin_lock(&xprt->sc_map_lock); + list_add(&map->free, &xprt->sc_maps); + spin_unlock(&xprt->sc_map_lock); } -void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) { - kmem_cache_free(svc_rdma_map_cachep, map); + while (!list_empty(&xprt->sc_maps)) { + struct svc_rdma_req_map *map; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del(&map->free); + kfree(map); + } } /* ib_cq event handler */ @@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) static void process_context(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *ctxt) { + struct svc_rdma_op_ctxt *read_hdr; + int free_pages = 0; + svc_rdma_unmap_dma(ctxt); switch (ctxt->wr_op) { case IB_WR_SEND: - if (ctxt->frmr) - pr_err("svcrdma: SEND: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 1); + free_pages = 1; break; case IB_WR_RDMA_WRITE: - if (ctxt->frmr) - pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: svc_rdma_put_frmr(xprt, ctxt->frmr); - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - if (read_hdr) { - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - } else { - pr_err("svcrdma: ctxt->read_hdr == NULL\n"); - } - svc_xprt_enqueue(&xprt->sc_xprt); - } + + if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) + break; + + read_hdr = ctxt->read_hdr; svc_rdma_put_context(ctxt, 0); - break; + + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + return; default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d\n", - ctxt->wr_op); + dprintk("svcrdma: unexpected completion opcode=%d\n", + ctxt->wr_op); break; } + + svc_rdma_put_context(ctxt, free_pages); } /* @@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); + INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); - - cma_xprt->sc_ord = svcrdma_ord; - - cma_xprt->sc_max_req_size = svcrdma_max_req_size; - cma_xprt->sc_max_requests = svcrdma_max_requests; - cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; - atomic_set(&cma_xprt->sc_sq_count, 0); - atomic_set(&cma_xprt->sc_ctxt_used, 0); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_map_lock); if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) { struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; @@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + page = alloc_page(flags); + if (!page) + goto err_put_ctxt; ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count = sge_no + 1; buflen += PAGE_SIZE; } @@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; - int uninitialized_var(dma_mr_acc); - int need_dma_mr = 0; - int ret; - int i; + struct ib_device *dev; + unsigned int i; + int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", newxprt, newxprt->sc_cm_id); - ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); - if (ret) { - dprintk("svcrdma: could not query device attributes on " - "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); - goto errout; - } + dev = newxprt->sc_cm_id->device; /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_sge = min((size_t)devattr.max_sge, + newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); - newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, RPCSVC_MAXPAGES); - newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, - (size_t)svcrdma_max_requests); - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_requests); + newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_bc_requests); + newxprt->sc_rq_depth = newxprt->sc_max_requests + + newxprt->sc_max_bc_requests; + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + + if (!svc_rdma_prealloc_ctxts(newxprt)) + goto errout; + if (!svc_rdma_prealloc_maps(newxprt)) + goto errout; /* * Limit ORD based on client limit, local device limit, and * configured svcrdma limit. */ - newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + newxprt->sc_pd = ib_alloc_pd(dev); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } cq_attr.cqe = newxprt->sc_sq_depth; - newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + newxprt->sc_sq_cq = ib_create_cq(dev, sq_comp_handler, cq_event_handler, newxprt, @@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_max_requests; - newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + cq_attr.cqe = newxprt->sc_rq_depth; + newxprt->sc_rq_cq = ib_create_cq(dev, rq_comp_handler, cq_event_handler, newxprt, @@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; - qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " cap.max_send_sge = %d\n" " cap.max_recv_sge = %d\n", newxprt->sc_cm_id, newxprt->sc_pd, - newxprt->sc_cm_id->device, newxprt->sc_pd->device, + dev, newxprt->sc_pd->device, qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr, qp_attr.cap.max_send_sge, @@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * of an RDMA_READ. IB does not. */ newxprt->sc_reader = rdma_read_chunk_lcl; - if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = - devattr.max_fast_reg_page_list_len; + dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; } @@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* * Determine if a DMA MR is required and if so, what privs are required */ - if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !rdma_ib_or_roce(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) goto errout; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || - !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) - dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; - } - - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ - if (need_dma_mr) { - /* Register all of physical memory */ - newxprt->sc_phys_mr = - ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", - ret); - goto errout; - } - newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; - } else - newxprt->sc_dma_lkey = - newxprt->sc_cm_id->device->local_dma_lkey; - /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); + for (i = 0; i < newxprt->sc_rq_depth; i++) { + ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); goto errout; @@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + struct svc_xprt *xprt = &rdma->sc_xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, rdma); /* We should only be called from kref_put */ - if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) + if (atomic_read(&xprt->xpt_ref.refcount) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); + atomic_read(&xprt->xpt_ref.refcount)); /* * Destroy queued, but not processed read completions. Note @@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work) } /* Warn if we leaked a resource or under-referenced */ - if (atomic_read(&rdma->sc_ctxt_used) != 0) + if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", - atomic_read(&rdma->sc_ctxt_used)); + rdma->sc_ctxt_used); if (atomic_read(&rdma->sc_dma_used) != 0) pr_err("svcrdma: dma still in use? (%d)\n", atomic_read(&rdma->sc_dma_used)); - /* De-allocate fastreg mr */ + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_ctxts(rdma); + svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) ib_destroy_cq(rdma->sc_rq_cq); - if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) - ib_dereg_mr(rdma->sc_phys_mr); - if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); @@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, int length; int ret; - p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + p = alloc_page(GFP_KERNEL); + if (!p) + return; va = page_address(p); /* XDR encode error */ @@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, return; } atomic_inc(&xprt->sc_dma_used); - ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[0].length = length; /* Prepare SEND WR */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c545f7d7525..b1b009f10ea3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -63,7 +63,7 @@ */ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; -static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; @@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = { #endif -#define RPCRDMA_BIND_TO (60U * HZ) -#define RPCRDMA_INIT_REEST_TO (5U * HZ) -#define RPCRDMA_MAX_REEST_TO (30U * HZ) -#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) - -static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ +static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; } -static void +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { char buf[128]; @@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } -static void +void xprt_rdma_free_addresses(struct rpc_xprt *xprt) { unsigned int i; @@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) if (req == NULL) return NULL; - flags = GFP_NOIO | __GFP_NOWARN; + flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; @@ -576,6 +571,9 @@ xprt_rdma_free(void *buffer) rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); req = rb->rg_owner; + if (req->rl_backchannel) + return; + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); @@ -639,7 +637,7 @@ drop_connection: return -ENOTCONN; /* implies disconnect */ } -static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); long idle_time = 0; @@ -740,6 +738,11 @@ void xprt_rdma_cleanup(void) rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", + __func__, rc); } int xprt_rdma_init(void) @@ -763,6 +766,14 @@ int xprt_rdma_init(void) return rc; } + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index eadd1655145a..878f1bfb1db9 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -462,7 +462,6 @@ int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { struct rpcrdma_ia *ia = &xprt->rx_ia; - struct ib_device_attr *devattr = &ia->ri_devattr; int rc; ia->ri_dma_mr = NULL; @@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - rc = ib_query_device(ia->ri_device, devattr); - if (rc) { - dprintk("RPC: %s: ib_query_device failed %d\n", - __func__, rc); - goto out3; - } - if (memreg == RPCRDMA_FRMR) { - if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || - (devattr->max_fast_reg_page_list_len == 0)) { + if (!(ia->ri_device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS) || + (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; @@ -566,24 +559,23 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; struct ib_cq_init_attr cq_attr = {}; unsigned int max_qp_wr; int rc, err; - if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { dprintk("RPC: %s: insufficient sge's available\n", __func__); return -ENOMEM; } - if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { dprintk("RPC: %s: insufficient wqe's available\n", __func__); return -ENOMEM; } - max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; /* check provider's send/recv wr limits */ if (cdata->max_requests > max_qp_wr) @@ -616,10 +608,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* set trigger for requesting send completion */ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS) - ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS; - else if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; /* always signal? */ INIT_CQCOUNT(ep); init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); @@ -670,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = - devattr->max_qp_rd_atom; + ia->ri_device->attrs.max_qp_rd_atom; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -852,10 +842,11 @@ retry: if (extras) { rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); - if (rc) + if (rc) { pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", __func__, rc); rc = 0; + } } } @@ -1337,15 +1328,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; - unsigned long flags; int rc; while (count--) { - spin_lock_irqsave(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_recv_bufs)) goto out_reqbuf; rep = rpcrdma_buffer_get_rep_locked(buffers); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) @@ -1355,7 +1345,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) return 0; out_reqbuf: - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); pr_warn("%s: no extra receive buffers\n", __func__); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ac7f8d4f632a..38fe11b09875 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -55,6 +55,11 @@ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + /* * Interface Adapter -- one per transport instance */ @@ -68,7 +73,6 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; - struct ib_device_attr ri_devattr; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -88,12 +92,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -/* - * Force a signaled SEND Work Request every so often, - * in case the provider needs to do some housekeeping. - */ -#define RPCRDMA_MAX_UNSIGNALED_SENDS (32) - #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) @@ -148,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -207,6 +207,12 @@ struct rpcrdma_frmr { enum rpcrdma_frmr_state fr_state; struct work_struct fr_work; struct rpcrdma_xprt *fr_xprt; + bool fr_waiter; + struct completion fr_linv_done;; + union { + struct ib_reg_wr fr_regwr; + struct ib_send_wr fr_invwr; + }; }; struct rpcrdma_fmr { @@ -309,6 +315,8 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ struct list_head rb_allreqs; + + u32 rb_bc_max_requests; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -364,6 +372,8 @@ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); + void (*ro_unmap_sync)(struct rpcrdma_xprt *, + struct rpcrdma_req *); int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); int (*ro_open)(struct rpcrdma_ia *, @@ -514,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *); /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_max_inline_read; +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); +void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -529,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* Temporary NFS request map cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_map_cachep; -/* WR context cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_ctxt_cachep; -/* Workqueue created in svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; +extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2ffaf6a79499..fde2138b81e7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,20 +465,11 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) { - /* - * Notify TCP that we're limited by the application - * window size - */ - set_bit(SOCK_NOSPACE, &transport->sock->flags); - sk->sk_write_pending++; - /* ...and wait for more buffer space */ - xprt_wait_for_buffer_space(task, xs_nospace_callback); - } - } else { - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + /* wait for more buffer space */ + sk->sk_write_pending++; + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } else ret = -ENOTCONN; - } spin_unlock_bh(&xprt->transport_lock); @@ -616,9 +605,6 @@ process_status: case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ENETUNREACH: case -ENOBUFS: case -EPIPE: @@ -626,7 +612,10 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -706,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -1609,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { - struct socket *sock; + struct socket_wq *wq; struct rpc_xprt *xprt; - if (unlikely(!(sock = sk->sk_socket))) + if (!sk->sk_socket) return; - clear_bit(SOCK_NOSPACE, &sock->flags); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0) - return; + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) + goto out; xprt_write_space(xprt); +out: + rcu_read_unlock(); } /** @@ -1907,18 +1900,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock) } } #else -static inline void xs_reclassify_socketu(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket4(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket6(struct socket *sock) -{ -} - static inline void xs_reclassify_socket(int family, struct socket *sock) { } @@ -2008,7 +1989,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) "transport socket (%d).\n", -status); goto out; } - xs_reclassify_socketu(sock); + xs_reclassify_socket(AF_LOCAL, sock); dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index ebc661d3b6e3..47f7da58a7f0 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -20,6 +20,7 @@ #include <linux/list.h> #include <linux/workqueue.h> #include <linux/if_vlan.h> +#include <linux/rtnetlink.h> #include <net/ip_fib.h> #include <net/switchdev.h> @@ -567,7 +568,6 @@ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, } EXPORT_SYMBOL_GPL(switchdev_port_obj_dump); -static DEFINE_MUTEX(switchdev_mutex); static RAW_NOTIFIER_HEAD(switchdev_notif_chain); /** @@ -582,9 +582,9 @@ int register_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&switchdev_mutex); + rtnl_lock(); err = raw_notifier_chain_register(&switchdev_notif_chain, nb); - mutex_unlock(&switchdev_mutex); + rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(register_switchdev_notifier); @@ -600,9 +600,9 @@ int unregister_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&switchdev_mutex); + rtnl_lock(); err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb); - mutex_unlock(&switchdev_mutex); + rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); @@ -616,16 +616,17 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); * Call all network notifier blocks. This should be called by driver * when it needs to propagate hardware event. * Return values are same as for atomic_notifier_call_chain(). + * rtnl_lock must be held. */ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info) { int err; + ASSERT_RTNL(); + info->dev = dev; - mutex_lock(&switchdev_mutex); err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); - mutex_unlock(&switchdev_mutex); return err; } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); diff --git a/net/tipc/link.c b/net/tipc/link.c index 0c2944fb9ae0..e31d92f80572 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -123,7 +123,6 @@ struct tipc_stats { struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; - struct tipc_media_addr *media_addr; struct net *net; /* Management and link supervision data */ @@ -904,8 +903,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) return link_schedule_user(l, list); } - if (unlikely(msg_size(hdr) > mtu)) + if (unlikely(msg_size(hdr) > mtu)) { + skb_queue_purge(list); return -EMSGSIZE; + } /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { @@ -917,8 +918,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (likely(skb_queue_len(transmq) < maxwin)) { _skb = skb_clone(skb, GFP_ATOMIC); - if (!_skb) + if (!_skb) { + skb_queue_purge(list); return -ENOBUFS; + } __skb_dequeue(list); __skb_queue_tail(transmq, skb); __skb_queue_tail(xmitq, _skb); @@ -1261,26 +1264,6 @@ drop: return rc; } -/* - * Send protocol message to the other endpoint. - */ -static void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, - int probe_msg, u32 gap, u32 tolerance, - u32 priority) -{ - struct sk_buff *skb = NULL; - struct sk_buff_head xmitq; - - __skb_queue_head_init(&xmitq); - tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap, - tolerance, priority, &xmitq); - skb = __skb_dequeue(&xmitq); - if (!skb) - return; - tipc_bearer_xmit_skb(l->net, l->bearer_id, skb, l->media_addr); - l->rcv_unacked = 0; -} - static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) @@ -1479,6 +1462,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) l->tolerance = peers_tol; + if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI, + TIPC_MAX_LINK_PRI)) { + l->priority = peers_prio; + rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + } + l->silent_intv_cnt = 0; l->stats.recv_states++; if (msg_probe(hdr)) @@ -1973,8 +1962,10 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_LINK_GET); - if (!hdr) + if (!hdr) { + tipc_bcast_unlock(net); return -EMSGSIZE; + } attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); if (!attrs) @@ -2021,16 +2012,18 @@ msg_full: return -EMSGSIZE; } -void tipc_link_set_tolerance(struct tipc_link *l, u32 tol) +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, + struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_proto_xmit(l, STATE_MSG, 0, 0, tol, 0); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq); } -void tipc_link_set_prio(struct tipc_link *l, u32 prio) +void tipc_link_set_prio(struct tipc_link *l, u32 prio, + struct sk_buff_head *xmitq) { l->priority = prio; - tipc_link_proto_xmit(l, STATE_MSG, 0, 0, 0, prio); + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq); } void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) diff --git a/net/tipc/link.h b/net/tipc/link.h index b2ae0f4276af..b4ee9d6e181d 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -112,8 +112,10 @@ char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); unsigned long tipc_link_tolerance(struct tipc_link *l); -void tipc_link_set_tolerance(struct tipc_link *l, u32 tol); -void tipc_link_set_prio(struct tipc_link *l, u32 prio); +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, + struct sk_buff_head *xmitq); +void tipc_link_set_prio(struct tipc_link *l, u32 prio, + struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 91fce70291a8..777b979b8463 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -418,6 +418,9 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, struct tipc_subscription *s) { struct sub_seq *sseq = nseq->sseqs; + struct tipc_name_seq ns; + + tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); list_add(&s->nameseq_list, &nseq->subscriptions); @@ -425,7 +428,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, return; while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) { + if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) { struct publication *crs; struct name_info *info = sseq->info; int must_report = 1; @@ -722,9 +725,10 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, void tipc_nametbl_subscribe(struct tipc_subscription *s) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); - u32 type = s->seq.type; + u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); int index = hash(type); struct name_seq *seq; + struct tipc_name_seq ns; spin_lock_bh(&tn->nametbl_lock); seq = nametbl_find_seq(s->net, type); @@ -735,8 +739,9 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s) tipc_nameseq_subscribe(seq, s); spin_unlock_bh(&seq->lock); } else { + tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); pr_warn("Failed to create subscription for {%u,%u,%u}\n", - s->seq.type, s->seq.lower, s->seq.upper); + ns.type, ns.lower, ns.upper); } spin_unlock_bh(&tn->nametbl_lock); } @@ -748,9 +753,10 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); struct name_seq *seq; + u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(s->net, s->seq.type); + seq = nametbl_find_seq(s->net, type); if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&s->nameseq_list); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 2c016fdefe97..d7d050f44fc1 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1104,8 +1104,8 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) req_nlh = (struct nlmsghdr *)skb->data; msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; - msg.dst_sk = info->dst_sk; msg.net = genl_info_net(info); + msg.dst_sk = skb->sk; if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN); diff --git a/net/tipc/node.c b/net/tipc/node.c index fa97d9649a28..cdb79503d890 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -225,9 +225,10 @@ static unsigned int tipc_hashfn(u32 addr) static void tipc_node_kref_release(struct kref *kref) { - struct tipc_node *node = container_of(kref, struct tipc_node, kref); + struct tipc_node *n = container_of(kref, struct tipc_node, kref); - tipc_node_delete(node); + kfree(n->bc_entry.link); + kfree_rcu(n, rcu); } static void tipc_node_put(struct tipc_node *node) @@ -245,23 +246,23 @@ static void tipc_node_get(struct tipc_node *node) */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_node *node; + unsigned int thash = tipc_hashfn(addr); if (unlikely(!in_own_cluster_exact(net, addr))) return NULL; rcu_read_lock(); - hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)], - hash) { - if (node->addr == addr) { - tipc_node_get(node); - rcu_read_unlock(); - return node; - } + hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { + if (node->addr != addr) + continue; + if (!kref_get_unless_zero(&node->kref)) + node = NULL; + break; } rcu_read_unlock(); - return NULL; + return node; } static void tipc_node_read_lock(struct tipc_node *n) @@ -346,12 +347,6 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); - hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); - list_for_each_entry_rcu(temp_node, &tn->node_list, list) { - if (n->addr < temp_node->addr) - break; - } - list_add_tail_rcu(&n->list, &temp_node->list); n->state = SELF_DOWN_PEER_LEAVING; n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; @@ -372,6 +367,12 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) tipc_node_get(n); setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n); n->keepalive_intv = U32_MAX; + hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); exit: spin_unlock_bh(&tn->node_list_lock); return n; @@ -395,21 +396,20 @@ static void tipc_node_delete(struct tipc_node *node) { list_del_rcu(&node->list); hlist_del_rcu(&node->hash); - kfree(node->bc_entry.link); - kfree_rcu(node, rcu); + tipc_node_put(node); + + del_timer_sync(&node->timer); + tipc_node_put(node); } void tipc_node_stop(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); - list_for_each_entry_safe(node, t_node, &tn->node_list, list) { - if (del_timer(&node->timer)) - tipc_node_put(node); - tipc_node_put(node); - } + list_for_each_entry_safe(node, t_node, &tn->node_list, list) + tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } @@ -530,9 +530,7 @@ static void tipc_node_timeout(unsigned long data) if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } - if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) - tipc_node_get(n); - tipc_node_put(n); + mod_timer(&n->timer, jiffies + n->keepalive_intv); } /** @@ -1166,7 +1164,7 @@ msg_full: * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) @@ -1174,33 +1172,43 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; - int bearer_id = -1; - int rc = -EHOSTUNREACH; + int bearer_id; + int rc; + + if (in_own_node(net, dnode)) { + tipc_sk_rcv(net, list); + return 0; + } - __skb_queue_head_init(&xmitq); n = tipc_node_find(net, dnode); - if (likely(n)) { - tipc_node_read_lock(n); - bearer_id = n->active_links[selector & 1]; - if (bearer_id >= 0) { - le = &n->links[bearer_id]; - spin_lock_bh(&le->lock); - rc = tipc_link_xmit(le->link, list, &xmitq); - spin_unlock_bh(&le->lock); - } + if (unlikely(!n)) { + skb_queue_purge(list); + return -EHOSTUNREACH; + } + + tipc_node_read_lock(n); + bearer_id = n->active_links[selector & 1]; + if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); - if (likely(!rc)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); - else if (rc == -ENOBUFS) - tipc_node_link_down(n, bearer_id, false); tipc_node_put(n); - return rc; + skb_queue_purge(list); + return -EHOSTUNREACH; } - if (likely(in_own_node(net, dnode))) { - tipc_sk_rcv(net, list); - return 0; - } + __skb_queue_head_init(&xmitq); + le = &n->links[bearer_id]; + spin_lock_bh(&le->lock); + rc = tipc_link_xmit(le->link, list, &xmitq); + spin_unlock_bh(&le->lock); + tipc_node_read_unlock(n); + + if (likely(rc == 0)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + else if (rc == -ENOBUFS) + tipc_node_link_down(n, bearer_id, false); + + tipc_node_put(n); + return rc; } @@ -1637,9 +1645,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) char *name; struct tipc_link *link; struct tipc_node *node; + struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); + __skb_queue_head_init(&xmitq); + if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; @@ -1683,13 +1694,13 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - tipc_link_set_tolerance(link, tol); + tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); - tipc_link_set_prio(link, prio); + tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 win; @@ -1701,7 +1712,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) out: tipc_node_read_unlock(node); - + tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr); return res; } diff --git a/net/tipc/server.c b/net/tipc/server.c index 922e04a43396..2446bfbaa309 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -571,13 +571,13 @@ static void tipc_work_stop(struct tipc_server *s) static int tipc_work_start(struct tipc_server *s) { - s->rcv_wq = alloc_workqueue("tipc_rcv", WQ_UNBOUND, 1); + s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); if (!s->rcv_wq) { pr_err("can't start tipc receive workqueue\n"); return -ENOMEM; } - s->send_wq = alloc_workqueue("tipc_send", WQ_UNBOUND, 1); + s->send_wq = alloc_ordered_workqueue("tipc_send", 0); if (!s->send_wq) { pr_err("can't start tipc send workqueue\n"); destroy_workqueue(s->rcv_wq); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 350cca33ee0a..22963cafd5ed 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -92,25 +92,42 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub, * * Returns 1 if there is overlap, otherwise 0. */ -int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, +int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper) { - if (found_lower < sub->seq.lower) - found_lower = sub->seq.lower; - if (found_upper > sub->seq.upper) - found_upper = sub->seq.upper; + if (found_lower < seq->lower) + found_lower = seq->lower; + if (found_upper > seq->upper) + found_upper = seq->upper; if (found_lower > found_upper) return 0; return 1; } +u32 tipc_subscrp_convert_seq_type(u32 type, int swap) +{ + return htohl(type, swap); +} + +void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, + struct tipc_name_seq *out) +{ + out->type = htohl(in->type, swap); + out->lower = htohl(in->lower, swap); + out->upper = htohl(in->upper, swap); +} + void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, u32 node, int must) { - if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper)) + struct tipc_name_seq seq; + + tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); + if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; - if (!must && !(sub->filter & TIPC_SUB_PORTS)) + if (!must && + !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS)) return; tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, @@ -171,12 +188,14 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { struct tipc_subscription *sub, *temp; + u32 timeout; spin_lock_bh(&subscriber->lock); /* Destroy any existing subscriptions for subscriber */ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { - if (del_timer(&sub->timer)) { + timeout = htohl(sub->evt.s.timeout, sub->swap); + if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) { tipc_subscrp_delete(sub); tipc_subscrb_put(subscriber); } @@ -200,13 +219,16 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { struct tipc_subscription *sub, *temp; + u32 timeout; spin_lock_bh(&subscriber->lock); /* Find first matching subscription, exit if not found */ list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, subscrp_list) { if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - if (del_timer(&sub->timer)) { + timeout = htohl(sub->evt.s.timeout, sub->swap); + if ((timeout == TIPC_WAIT_FOREVER) || + del_timer(&sub->timer)) { tipc_subscrp_delete(sub); tipc_subscrb_put(subscriber); } @@ -216,66 +238,67 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s, spin_unlock_bh(&subscriber->lock); } -static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, - struct tipc_subscription **sub_p) +static struct tipc_subscription *tipc_subscrp_create(struct net *net, + struct tipc_subscr *s, + int swap) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub; - int swap; - - /* Determine subscriber's endianness */ - swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE)); - - /* Detect & process a subscription cancellation request */ - if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { - s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - tipc_subscrp_cancel(s, subscriber); - return 0; - } + u32 filter = htohl(s->filter, swap); /* Refuse subscription if global limit exceeded */ if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) { pr_warn("Subscription rejected, limit reached (%u)\n", TIPC_MAX_SUBSCRIPTIONS); - return -EINVAL; + return NULL; } /* Allocate subscription object */ sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); - return -ENOMEM; + return NULL; } /* Initialize subscription object */ sub->net = net; - sub->seq.type = htohl(s->seq.type, swap); - sub->seq.lower = htohl(s->seq.lower, swap); - sub->seq.upper = htohl(s->seq.upper, swap); - sub->timeout = msecs_to_jiffies(htohl(s->timeout, swap)); - sub->filter = htohl(s->filter, swap); - if ((!(sub->filter & TIPC_SUB_PORTS) == - !(sub->filter & TIPC_SUB_SERVICE)) || - (sub->seq.lower > sub->seq.upper)) { + if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) || + (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) { pr_warn("Subscription rejected, illegal request\n"); kfree(sub); - return -EINVAL; + return NULL; } - spin_lock_bh(&subscriber->lock); - list_add(&sub->subscrp_list, &subscriber->subscrp_list); - spin_unlock_bh(&subscriber->lock); - sub->subscriber = subscriber; + sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); + return sub; +} + +static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, + struct tipc_subscriber *subscriber, int swap) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_subscription *sub = NULL; + u32 timeout; + + sub = tipc_subscrp_create(net, s, swap); + if (!sub) + return tipc_conn_terminate(tn->topsrv, subscriber->conid); + + spin_lock_bh(&subscriber->lock); + list_add(&sub->subscrp_list, &subscriber->subscrp_list); + tipc_subscrb_get(subscriber); + sub->subscriber = subscriber; + tipc_nametbl_subscribe(sub); + spin_unlock_bh(&subscriber->lock); + + timeout = htohl(sub->evt.s.timeout, swap); + if (timeout == TIPC_WAIT_FOREVER) + return; + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); - if (sub->timeout != TIPC_WAIT_FOREVER) - sub->timeout += jiffies; - if (!mod_timer(&sub->timer, sub->timeout)) - tipc_subscrb_get(subscriber); - *sub_p = sub; - return 0; + mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); } /* Handle one termination request for the subscriber */ @@ -290,14 +313,20 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, void *buf, size_t len) { struct tipc_subscriber *subscriber = usr_data; - struct tipc_subscription *sub = NULL; - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_subscr *s = (struct tipc_subscr *)buf; + int swap; + + /* Determine subscriber's endianness */ + swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | + TIPC_SUB_CANCEL)); + + /* Detect & process a subscription cancellation request */ + if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { + s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); + return tipc_subscrp_cancel(s, subscriber); + } - tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscriber, &sub); - if (sub) - tipc_nametbl_subscribe(sub); - else - tipc_conn_terminate(tn->topsrv, subscriber->conid); + tipc_subscrp_subscribe(net, s, subscriber, swap); } /* Handle one request to establish a new subscriber */ diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 92ee18cc5fe6..be60103082c9 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -50,21 +50,15 @@ struct tipc_subscriber; * @subscriber: pointer to its subscriber * @seq: name sequence associated with subscription * @net: point to network namespace - * @timeout: duration of subscription (in ms) - * @filter: event filtering to be done for subscription * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list * @subscrp_list: adjacent subscriptions in subscriber's subscription list - * @server_ref: object reference of server port associated with subscription * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription */ struct tipc_subscription { struct tipc_subscriber *subscriber; - struct tipc_name_seq seq; struct net *net; - unsigned long timeout; - u32 filter; struct timer_list timer; struct list_head nameseq_list; struct list_head subscrp_list; @@ -72,11 +66,14 @@ struct tipc_subscription { struct tipc_event evt; }; -int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, +int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, u32 node, int must); +void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, + struct tipc_name_seq *out); +u32 tipc_subscrp_convert_seq_type(u32 type, int swap); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c5bf5ef2bf89..8269da73e9e5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->fp[i]); + unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_destruct_scm(struct sk_buff *skb) @@ -1534,7 +1534,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; unsigned char max_level = 0; - int unix_sock_count = 0; if (too_many_unix_fds(current)) return -ETOOMANYREFS; @@ -1542,11 +1541,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) { struct sock *sk = unix_get_socket(scm->fp->fp[i]); - if (sk) { - unix_sock_count++; + if (sk) max_level = max(max_level, unix_sk(sk)->recursion_level); - } } if (unlikely(max_level > MAX_RECURSION_LEVEL)) return -ETOOMANYREFS; @@ -1561,7 +1558,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) return -ENOMEM; for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); + unix_inflight(scm->fp->user, scm->fp->fp[i]); return max_level; } @@ -1781,7 +1778,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); @@ -2277,13 +2279,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) size_t size = state->size; unsigned int last_len; - err = -EINVAL; - if (sk->sk_state != TCP_ESTABLISHED) + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + err = -EINVAL; goto out; + } - err = -EOPNOTSUPP; - if (flags & MSG_OOB) + if (unlikely(flags & MSG_OOB)) { + err = -EOPNOTSUPP; goto out; + } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); @@ -2305,6 +2309,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) bool drop_skb; struct sk_buff *skb, *last; +redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -2329,9 +2334,11 @@ again: goto unlock; unix_state_unlock(sk); - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; break; + } + mutex_unlock(&u->readlock); timeo = unix_stream_data_wait(sk, timeo, last, @@ -2339,11 +2346,12 @@ again: if (signal_pending(current)) { err = sock_intr_errno(timeo); + scm_destroy(&scm); goto out; } mutex_lock(&u->readlock); - continue; + goto redo; unlock: unix_state_unlock(sk); break; diff --git a/net/unix/diag.c b/net/unix/diag.c index c512f64d5287..4d9679701a6d 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -220,7 +220,7 @@ done: return skb->len; } -static struct sock *unix_lookup_by_ino(int ino) +static struct sock *unix_lookup_by_ino(unsigned int ino) { int i; struct sock *sk; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8fcdc2283af5..6a0d48525fcf 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp) * descriptor if it is for an AF_UNIX socket. */ -void unix_inflight(struct file *fp) +void unix_inflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -133,11 +133,11 @@ void unix_inflight(struct file *fp) } unix_tot_inflight++; } - fp->f_cred->user->unix_inflight++; + user->unix_inflight++; spin_unlock(&unix_gc_lock); } -void unix_notinflight(struct file *fp) +void unix_notinflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp) list_del_init(&u->link); unix_tot_inflight--; } - fp->f_cred->user->unix_inflight--; + user->unix_inflight--; spin_unlock(&unix_gc_lock); } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7fd1220fbfa0..bbe65dcb9738 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1557,8 +1557,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (total_written < len) { ssize_t written; @@ -1578,7 +1576,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); @@ -1588,8 +1588,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; } - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } /* These checks occur both as part of and after the loop @@ -1635,7 +1633,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, out_wait: if (total_written > 0) err = total_written; - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; @@ -1716,7 +1713,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (1) { s64 ready = vsock_stream_has_data(vsk); @@ -1727,7 +1723,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, */ err = -ENOMEM; - goto out_wait; + goto out; } else if (ready > 0) { ssize_t read; @@ -1750,7 +1746,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) - goto out_wait; + goto out; if (read >= target || flags & MSG_PEEK) break; @@ -1773,7 +1769,9 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, break; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { @@ -1783,9 +1781,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = -EAGAIN; break; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } } @@ -1816,8 +1811,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = copied; } -out_wait: - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bc76b281ed3a..c5fb317eee68 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -222,20 +222,22 @@ static const struct ieee80211_regdomain world_regdom = { /* IEEE 802.11b/g, channels 1..11 */ REG_RULE(2412-10, 2462+10, 40, 6, 20, 0), /* IEEE 802.11b/g, channels 12..13. */ - REG_RULE(2467-10, 2472+10, 40, 6, 20, - NL80211_RRF_NO_IR), + REG_RULE(2467-10, 2472+10, 20, 6, 20, + NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW), /* IEEE 802.11 channel 14 - Only JP enables * this and for 802.11b only */ REG_RULE(2484-10, 2484+10, 20, 6, 20, NL80211_RRF_NO_IR | NL80211_RRF_NO_OFDM), /* IEEE 802.11a, channel 36..48 */ - REG_RULE(5180-10, 5240+10, 160, 6, 20, - NL80211_RRF_NO_IR), + REG_RULE(5180-10, 5240+10, 80, 6, 20, + NL80211_RRF_NO_IR | + NL80211_RRF_AUTO_BW), /* IEEE 802.11a, channel 52..64 - DFS required */ - REG_RULE(5260-10, 5320+10, 160, 6, 20, + REG_RULE(5260-10, 5320+10, 80, 6, 20, NL80211_RRF_NO_IR | + NL80211_RRF_AUTO_BW | NL80211_RRF_DFS), /* IEEE 802.11a, channel 100..144 - DFS required */ @@ -2692,7 +2694,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) const struct ieee80211_power_rule *power_rule = NULL; char bw[32], cac_time[32]; - pr_info(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n"); + pr_debug(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n"); for (i = 0; i < rd->n_reg_rules; i++) { reg_rule = &rd->reg_rules[i]; @@ -2719,7 +2721,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) * in certain regions */ if (power_rule->max_antenna_gain) - pr_info(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n", + pr_debug(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, bw, @@ -2727,7 +2729,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd) power_rule->max_eirp, cac_time); else - pr_info(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n", + pr_debug(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n", freq_range->start_freq_khz, freq_range->end_freq_khz, bw, @@ -2759,35 +2761,35 @@ static void print_regdomain(const struct ieee80211_regdomain *rd) struct cfg80211_registered_device *rdev; rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx); if (rdev) { - pr_info("Current regulatory domain updated by AP to: %c%c\n", + pr_debug("Current regulatory domain updated by AP to: %c%c\n", rdev->country_ie_alpha2[0], rdev->country_ie_alpha2[1]); } else - pr_info("Current regulatory domain intersected:\n"); + pr_debug("Current regulatory domain intersected:\n"); } else - pr_info("Current regulatory domain intersected:\n"); + pr_debug("Current regulatory domain intersected:\n"); } else if (is_world_regdom(rd->alpha2)) { - pr_info("World regulatory domain updated:\n"); + pr_debug("World regulatory domain updated:\n"); } else { if (is_unknown_alpha2(rd->alpha2)) - pr_info("Regulatory domain changed to driver built-in settings (unknown country)\n"); + pr_debug("Regulatory domain changed to driver built-in settings (unknown country)\n"); else { if (reg_request_cell_base(lr)) - pr_info("Regulatory domain changed to country: %c%c by Cell Station\n", + pr_debug("Regulatory domain changed to country: %c%c by Cell Station\n", rd->alpha2[0], rd->alpha2[1]); else - pr_info("Regulatory domain changed to country: %c%c\n", + pr_debug("Regulatory domain changed to country: %c%c\n", rd->alpha2[0], rd->alpha2[1]); } } - pr_info(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region)); + pr_debug(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region)); print_rd_rules(rd); } static void print_regdomain_info(const struct ieee80211_regdomain *rd) { - pr_info("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]); + pr_debug("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]); print_rd_rules(rd); } @@ -2808,7 +2810,8 @@ static int reg_set_rd_user(const struct ieee80211_regdomain *rd, return -EALREADY; if (!is_valid_rd(rd)) { - pr_err("Invalid regulatory domain detected:\n"); + pr_err("Invalid regulatory domain detected: %c%c\n", + rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } @@ -2844,7 +2847,8 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, return -EALREADY; if (!is_valid_rd(rd)) { - pr_err("Invalid regulatory domain detected:\n"); + pr_err("Invalid regulatory domain detected: %c%c\n", + rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } @@ -2902,7 +2906,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, */ if (!is_valid_rd(rd)) { - pr_err("Invalid regulatory domain detected:\n"); + pr_err("Invalid regulatory domain detected: %c%c\n", + rd->alpha2[0], rd->alpha2[1]); print_regdomain_info(rd); return -EINVAL; } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index cc3676eb6239..ff4a91fcab9f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -167,6 +167,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb { struct sk_buff *segs; + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) |