summaryrefslogtreecommitdiff
path: root/drivers/infiniband/core/addr.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core/addr.c')
-rw-r--r--drivers/infiniband/core/addr.c406
1 files changed, 252 insertions, 154 deletions
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 46b855a42884..0dce94e3c495 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -45,6 +45,7 @@
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <rdma/ib_addr.h>
+#include <rdma/ib_sa.h>
#include <rdma/ib.h>
#include <rdma/rdma_netlink.h>
#include <net/netlink.h>
@@ -61,6 +62,7 @@ struct addr_req {
struct rdma_dev_addr *addr, void *context);
unsigned long timeout;
struct delayed_work work;
+ bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */
int status;
u32 seq;
};
@@ -219,60 +221,75 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr)
}
EXPORT_SYMBOL(rdma_addr_size_kss);
-void rdma_copy_addr(struct rdma_dev_addr *dev_addr,
- const struct net_device *dev,
- const unsigned char *dst_dev_addr)
+/**
+ * rdma_copy_src_l2_addr - Copy netdevice source addresses
+ * @dev_addr: Destination address pointer where to copy the addresses
+ * @dev: Netdevice whose source addresses to copy
+ *
+ * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice.
+ * This includes unicast address, broadcast address, device type and
+ * interface index.
+ */
+void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+ const struct net_device *dev)
{
dev_addr->dev_type = dev->type;
memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
- if (dst_dev_addr)
- memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
dev_addr->bound_dev_if = dev->ifindex;
}
-EXPORT_SYMBOL(rdma_copy_addr);
+EXPORT_SYMBOL(rdma_copy_src_l2_addr);
-int rdma_translate_ip(const struct sockaddr *addr,
- struct rdma_dev_addr *dev_addr)
+static struct net_device *
+rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in)
{
- struct net_device *dev;
+ struct net_device *dev = NULL;
+ int ret = -EADDRNOTAVAIL;
- if (dev_addr->bound_dev_if) {
- dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
- if (!dev)
- return -ENODEV;
- rdma_copy_addr(dev_addr, dev, NULL);
- dev_put(dev);
- return 0;
- }
-
- switch (addr->sa_family) {
+ switch (src_in->sa_family) {
case AF_INET:
- dev = ip_dev_find(dev_addr->net,
- ((const struct sockaddr_in *)addr)->sin_addr.s_addr);
-
- if (!dev)
- return -EADDRNOTAVAIL;
-
- rdma_copy_addr(dev_addr, dev, NULL);
- dev_put(dev);
+ dev = __ip_dev_find(net,
+ ((const struct sockaddr_in *)src_in)->sin_addr.s_addr,
+ false);
+ if (dev)
+ ret = 0;
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- rcu_read_lock();
- for_each_netdev_rcu(dev_addr->net, dev) {
- if (ipv6_chk_addr(dev_addr->net,
- &((const struct sockaddr_in6 *)addr)->sin6_addr,
+ for_each_netdev_rcu(net, dev) {
+ if (ipv6_chk_addr(net,
+ &((const struct sockaddr_in6 *)src_in)->sin6_addr,
dev, 1)) {
- rdma_copy_addr(dev_addr, dev, NULL);
+ ret = 0;
break;
}
}
- rcu_read_unlock();
break;
#endif
}
- return 0;
+ return ret ? ERR_PTR(ret) : dev;
+}
+
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr)
+{
+ struct net_device *dev;
+
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ rdma_copy_src_l2_addr(dev_addr, dev);
+ dev_put(dev);
+ return 0;
+ }
+
+ rcu_read_lock();
+ dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr);
+ if (!IS_ERR(dev))
+ rdma_copy_src_l2_addr(dev_addr, dev);
+ rcu_read_unlock();
+ return PTR_ERR_OR_ZERO(dev);
}
EXPORT_SYMBOL(rdma_translate_ip);
@@ -295,15 +312,12 @@ static void queue_req(struct addr_req *req)
spin_unlock_bh(&lock);
}
-static int ib_nl_fetch_ha(const struct dst_entry *dst,
- struct rdma_dev_addr *dev_addr,
+static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr,
const void *daddr, u32 seq, u16 family)
{
- if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
+ if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
return -EADDRNOTAVAIL;
- /* We fill in what we can, the response will fill the rest */
- rdma_copy_addr(dev_addr, dst->dev, NULL);
return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
}
@@ -322,7 +336,7 @@ static int dst_fetch_ha(const struct dst_entry *dst,
neigh_event_send(n, NULL);
ret = -ENODATA;
} else {
- rdma_copy_addr(dev_addr, dst->dev, n->ha);
+ memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN);
}
neigh_release(n);
@@ -356,18 +370,22 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
(const void *)&dst_in6->sin6_addr;
sa_family_t family = dst_in->sa_family;
- /* Gateway + ARPHRD_INFINIBAND -> IB router */
- if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
- return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
+ /* If we have a gateway in IB mode then it must be an IB network */
+ if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB)
+ return ib_nl_fetch_ha(dev_addr, daddr, seq, family);
else
return dst_fetch_ha(dst, dev_addr, daddr);
}
-static int addr4_resolve(struct sockaddr_in *src_in,
- const struct sockaddr_in *dst_in,
+static int addr4_resolve(struct sockaddr *src_sock,
+ const struct sockaddr *dst_sock,
struct rdma_dev_addr *addr,
struct rtable **prt)
{
+ struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock;
+ const struct sockaddr_in *dst_in =
+ (const struct sockaddr_in *)dst_sock;
+
__be32 src_ip = src_in->sin_addr.s_addr;
__be32 dst_ip = dst_in->sin_addr.s_addr;
struct rtable *rt;
@@ -383,16 +401,8 @@ static int addr4_resolve(struct sockaddr_in *src_in,
if (ret)
return ret;
- src_in->sin_family = AF_INET;
src_in->sin_addr.s_addr = fl4.saddr;
- /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
- * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
- * type accordingly.
- */
- if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
- addr->network = RDMA_NETWORK_IPV4;
-
addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
*prt = rt;
@@ -400,14 +410,16 @@ static int addr4_resolve(struct sockaddr_in *src_in,
}
#if IS_ENABLED(CONFIG_IPV6)
-static int addr6_resolve(struct sockaddr_in6 *src_in,
- const struct sockaddr_in6 *dst_in,
+static int addr6_resolve(struct sockaddr *src_sock,
+ const struct sockaddr *dst_sock,
struct rdma_dev_addr *addr,
struct dst_entry **pdst)
{
+ struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock;
+ const struct sockaddr_in6 *dst_in =
+ (const struct sockaddr_in6 *)dst_sock;
struct flowi6 fl6;
struct dst_entry *dst;
- struct rt6_info *rt;
int ret;
memset(&fl6, 0, sizeof fl6);
@@ -419,19 +431,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
if (ret < 0)
return ret;
- rt = (struct rt6_info *)dst;
- if (ipv6_addr_any(&src_in->sin6_addr)) {
- src_in->sin6_family = AF_INET6;
+ if (ipv6_addr_any(&src_in->sin6_addr))
src_in->sin6_addr = fl6.saddr;
- }
-
- /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
- * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
- * type accordingly.
- */
- if (rt->rt6i_flags & RTF_GATEWAY &&
- ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
- addr->network = RDMA_NETWORK_IPV6;
addr->hoplimit = ip6_dst_hoplimit(dst);
@@ -439,8 +440,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
return 0;
}
#else
-static int addr6_resolve(struct sockaddr_in6 *src_in,
- const struct sockaddr_in6 *dst_in,
+static int addr6_resolve(struct sockaddr *src_sock,
+ const struct sockaddr *dst_sock,
struct rdma_dev_addr *addr,
struct dst_entry **pdst)
{
@@ -451,36 +452,110 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
static int addr_resolve_neigh(const struct dst_entry *dst,
const struct sockaddr *dst_in,
struct rdma_dev_addr *addr,
+ unsigned int ndev_flags,
u32 seq)
{
- if (dst->dev->flags & IFF_LOOPBACK) {
- int ret;
+ int ret = 0;
- ret = rdma_translate_ip(dst_in, addr);
- if (!ret)
- memcpy(addr->dst_dev_addr, addr->src_dev_addr,
- MAX_ADDR_LEN);
+ if (ndev_flags & IFF_LOOPBACK) {
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ } else {
+ if (!(ndev_flags & IFF_NOARP)) {
+ /* If the device doesn't do ARP internally */
+ ret = fetch_ha(dst, addr, dst_in, seq);
+ }
+ }
+ return ret;
+}
- return ret;
+static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+ const struct sockaddr *dst_in,
+ const struct dst_entry *dst,
+ const struct net_device *ndev)
+{
+ int ret = 0;
+
+ if (dst->dev->flags & IFF_LOOPBACK)
+ ret = rdma_translate_ip(dst_in, dev_addr);
+ else
+ rdma_copy_src_l2_addr(dev_addr, dst->dev);
+
+ /*
+ * If there's a gateway and type of device not ARPHRD_INFINIBAND,
+ * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
+ * network type accordingly.
+ */
+ if (has_gateway(dst, dst_in->sa_family) &&
+ ndev->type != ARPHRD_INFINIBAND)
+ dev_addr->network = dst_in->sa_family == AF_INET ?
+ RDMA_NETWORK_IPV4 :
+ RDMA_NETWORK_IPV6;
+ else
+ dev_addr->network = RDMA_NETWORK_IB;
+
+ return ret;
+}
+
+static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
+ unsigned int *ndev_flags,
+ const struct sockaddr *dst_in,
+ const struct dst_entry *dst)
+{
+ struct net_device *ndev = READ_ONCE(dst->dev);
+
+ *ndev_flags = ndev->flags;
+ /* A physical device must be the RDMA device to use */
+ if (ndev->flags & IFF_LOOPBACK) {
+ /*
+ * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
+ * loopback IP address. So if route is resolved to loopback
+ * interface, translate that to a real ndev based on non
+ * loopback IP address.
+ */
+ ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
+ if (IS_ERR(ndev))
+ return -ENODEV;
}
- /* If the device doesn't do ARP internally */
- if (!(dst->dev->flags & IFF_NOARP))
- return fetch_ha(dst, addr, dst_in, seq);
+ return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
+}
+
+static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
+{
+ struct net_device *ndev;
- rdma_copy_addr(addr, dst->dev, NULL);
+ ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr);
+ if (IS_ERR(ndev))
+ return PTR_ERR(ndev);
+ /*
+ * Since we are holding the rcu, reading net and ifindex
+ * are safe without any additional reference; because
+ * change_net_namespace() in net/core/dev.c does rcu sync
+ * after it changes the state to IFF_DOWN and before
+ * updating netdev fields {net, ifindex}.
+ */
+ addr->net = dev_net(ndev);
+ addr->bound_dev_if = ndev->ifindex;
return 0;
}
+static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr)
+{
+ addr->net = &init_net;
+ addr->bound_dev_if = 0;
+}
+
static int addr_resolve(struct sockaddr *src_in,
const struct sockaddr *dst_in,
struct rdma_dev_addr *addr,
bool resolve_neigh,
+ bool resolve_by_gid_attr,
u32 seq)
{
- struct net_device *ndev;
- struct dst_entry *dst;
+ struct dst_entry *dst = NULL;
+ unsigned int ndev_flags = 0;
+ struct rtable *rt = NULL;
int ret;
if (!addr->net) {
@@ -488,58 +563,55 @@ static int addr_resolve(struct sockaddr *src_in,
return -EINVAL;
}
- if (src_in->sa_family == AF_INET) {
- struct rtable *rt = NULL;
- const struct sockaddr_in *dst_in4 =
- (const struct sockaddr_in *)dst_in;
-
- ret = addr4_resolve((struct sockaddr_in *)src_in,
- dst_in4, addr, &rt);
- if (ret)
- return ret;
-
- if (resolve_neigh)
- ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
-
- if (addr->bound_dev_if) {
- ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
- } else {
- ndev = rt->dst.dev;
- dev_hold(ndev);
+ rcu_read_lock();
+ if (resolve_by_gid_attr) {
+ if (!addr->sgid_attr) {
+ rcu_read_unlock();
+ pr_warn_ratelimited("%s: missing gid_attr\n", __func__);
+ return -EINVAL;
}
-
- ip_rt_put(rt);
- } else {
- const struct sockaddr_in6 *dst_in6 =
- (const struct sockaddr_in6 *)dst_in;
-
- ret = addr6_resolve((struct sockaddr_in6 *)src_in,
- dst_in6, addr,
- &dst);
- if (ret)
+ /*
+ * If the request is for a specific gid attribute of the
+ * rdma_dev_addr, derive net from the netdevice of the
+ * GID attribute.
+ */
+ ret = set_addr_netns_by_gid_rcu(addr);
+ if (ret) {
+ rcu_read_unlock();
return ret;
-
- if (resolve_neigh)
- ret = addr_resolve_neigh(dst, dst_in, addr, seq);
-
- if (addr->bound_dev_if) {
- ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
- } else {
- ndev = dst->dev;
- dev_hold(ndev);
}
-
- dst_release(dst);
}
-
- if (ndev) {
- if (ndev->flags & IFF_LOOPBACK)
- ret = rdma_translate_ip(dst_in, addr);
- else
- addr->bound_dev_if = ndev->ifindex;
- dev_put(ndev);
+ if (src_in->sa_family == AF_INET) {
+ ret = addr4_resolve(src_in, dst_in, addr, &rt);
+ dst = &rt->dst;
+ } else {
+ ret = addr6_resolve(src_in, dst_in, addr, &dst);
}
+ if (ret) {
+ rcu_read_unlock();
+ goto done;
+ }
+ ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
+ rcu_read_unlock();
+
+ /*
+ * Resolve neighbor destination address if requested and
+ * only if src addr translation didn't fail.
+ */
+ if (!ret && resolve_neigh)
+ ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
+ if (src_in->sa_family == AF_INET)
+ ip_rt_put(rt);
+ else
+ dst_release(dst);
+done:
+ /*
+ * Clear the addr net to go back to its original state, only if it was
+ * derived from GID attribute in this context.
+ */
+ if (resolve_by_gid_attr)
+ rdma_addr_set_net_defaults(addr);
return ret;
}
@@ -554,7 +626,8 @@ static void process_one_req(struct work_struct *_work)
src_in = (struct sockaddr *)&req->src_addr;
dst_in = (struct sockaddr *)&req->dst_addr;
req->status = addr_resolve(src_in, dst_in, req->addr,
- true, req->seq);
+ true, req->resolve_by_gid_attr,
+ req->seq);
if (req->status && time_after_eq(jiffies, req->timeout)) {
req->status = -ETIMEDOUT;
} else if (req->status == -ENODATA) {
@@ -586,10 +659,10 @@ static void process_one_req(struct work_struct *_work)
}
int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
- struct rdma_dev_addr *addr, int timeout_ms,
+ struct rdma_dev_addr *addr, unsigned long timeout_ms,
void (*callback)(int status, struct sockaddr *src_addr,
struct rdma_dev_addr *addr, void *context),
- void *context)
+ bool resolve_by_gid_attr, void *context)
{
struct sockaddr *src_in, *dst_in;
struct addr_req *req;
@@ -617,10 +690,12 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
req->addr = addr;
req->callback = callback;
req->context = context;
+ req->resolve_by_gid_attr = resolve_by_gid_attr;
INIT_DELAYED_WORK(&req->work, process_one_req);
req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
- req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
+ req->status = addr_resolve(src_in, dst_in, addr, true,
+ req->resolve_by_gid_attr, req->seq);
switch (req->status) {
case 0:
req->timeout = jiffies;
@@ -641,25 +716,53 @@ err:
}
EXPORT_SYMBOL(rdma_resolve_ip);
-int rdma_resolve_ip_route(struct sockaddr *src_addr,
- const struct sockaddr *dst_addr,
- struct rdma_dev_addr *addr)
+int roce_resolve_route_from_path(struct sa_path_rec *rec,
+ const struct ib_gid_attr *attr)
{
- struct sockaddr_storage ssrc_addr = {};
- struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid, dgid;
+ struct rdma_dev_addr dev_addr = {};
+ int ret;
- if (src_addr) {
- if (src_addr->sa_family != dst_addr->sa_family)
- return -EINVAL;
+ if (rec->roce.route_resolved)
+ return 0;
- memcpy(src_in, src_addr, rdma_addr_size(src_addr));
- } else {
- src_in->sa_family = dst_addr->sa_family;
- }
+ rdma_gid2ip(&sgid._sockaddr, &rec->sgid);
+ rdma_gid2ip(&dgid._sockaddr, &rec->dgid);
+
+ if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family)
+ return -EINVAL;
+
+ if (!attr || !attr->ndev)
+ return -EINVAL;
+
+ dev_addr.net = &init_net;
+ dev_addr.sgid_attr = attr;
+
+ ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr,
+ &dev_addr, false, true, 0);
+ if (ret)
+ return ret;
+
+ if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+ dev_addr.network == RDMA_NETWORK_IPV6) &&
+ rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
+ return -EINVAL;
- return addr_resolve(src_in, dst_addr, addr, false, 0);
+ rec->roce.route_resolved = true;
+ return 0;
}
+/**
+ * rdma_addr_cancel - Cancel resolve ip request
+ * @addr: Pointer to address structure given previously
+ * during rdma_resolve_ip().
+ * rdma_addr_cancel() is synchronous function which cancels any pending
+ * request if there is any.
+ */
void rdma_addr_cancel(struct rdma_dev_addr *addr)
{
struct addr_req *req, *temp_req;
@@ -687,11 +790,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr)
* guarentees no work is running and none will be started.
*/
cancel_delayed_work_sync(&found->work);
-
- if (found->callback)
- found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr,
- found->addr, found->context);
-
kfree(found);
}
EXPORT_SYMBOL(rdma_addr_cancel);
@@ -710,7 +808,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
const union ib_gid *dgid,
- u8 *dmac, const struct net_device *ndev,
+ u8 *dmac, const struct ib_gid_attr *sgid_attr,
int *hoplimit)
{
struct rdma_dev_addr dev_addr;
@@ -726,12 +824,12 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
rdma_gid2ip(&dgid_addr._sockaddr, dgid);
memset(&dev_addr, 0, sizeof(dev_addr));
- dev_addr.bound_dev_if = ndev->ifindex;
dev_addr.net = &init_net;
+ dev_addr.sgid_attr = sgid_attr;
init_completion(&ctx.comp);
ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr,
- &dev_addr, 1000, resolve_cb, &ctx);
+ &dev_addr, 1000, resolve_cb, true, &ctx);
if (ret)
return ret;