From 7025fcd36bd62af2c6ca0ea3490c00b216c4d168 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:28 -0700 Subject: IB: address translation to map IP toIB addresses (GIDs) Add an address translation service that maps IP addresses to InfiniBand GID addresses using IPoIB. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/Kconfig | 5 + drivers/infiniband/core/Makefile | 6 +- drivers/infiniband/core/addr.c | 367 +++++++++++++++++++++++++++++++++++++++ include/rdma/ib_addr.h | 114 ++++++++++++ 4 files changed, 491 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/addr.c create mode 100644 include/rdma/ib_addr.h diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index afc612b8577dc..ba2d6505e9a45 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -29,6 +29,11 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from . +config INFINIBAND_ADDR_TRANS + bool + depends on INFINIBAND && INET + default y + source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/ipath/Kconfig" diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index a56bf9c994f66..05095919d1682 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,5 +1,7 @@ +addr_trans-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o + obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o + ib_cm.o $(addr_trans-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o @@ -12,6 +14,8 @@ ib_sa-y := sa_query.o ib_cm-y := cm.o +ib_addr-y := addr.o + ib_umad-y := user_mad.o ib_ucm-y := ucm.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c new file mode 100644 index 0000000000000..d294bbc42f091 --- /dev/null +++ b/drivers/infiniband/core/addr.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("IB Address Translation"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct addr_req { + struct list_head list; + struct sockaddr src_addr; + struct sockaddr dst_addr; + struct rdma_dev_addr *addr; + void *context; + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context); + unsigned long timeout; + int status; +}; + +static void process_req(void *data); + +static DEFINE_MUTEX(lock); +static LIST_HEAD(req_list); +static DECLARE_WORK(work, process_req, NULL); +static struct workqueue_struct *addr_wq; + +static int copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, + unsigned char *dst_dev_addr) +{ + switch (dev->type) { + case ARPHRD_INFINIBAND: + dev_addr->dev_type = IB_NODE_CA; + break; + default: + return -EADDRNOTAVAIL; + } + + memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); + if (dst_dev_addr) + memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); + return 0; +} + +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + struct net_device *dev; + u32 ip = ((struct sockaddr_in *) addr)->sin_addr.s_addr; + int ret; + + dev = ip_dev_find(ip); + if (!dev) + return -EADDRNOTAVAIL; + + ret = copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_translate_ip); + +static void set_timeout(unsigned long time) +{ + unsigned long delay; + + cancel_delayed_work(&work); + + delay = time - jiffies; + if ((long)delay <= 0) + delay = 1; + + queue_delayed_work(addr_wq, &work, delay); +} + +static void queue_req(struct addr_req *req) +{ + struct addr_req *temp_req; + + mutex_lock(&lock); + list_for_each_entry_reverse(temp_req, &req_list, list) { + if (time_after(req->timeout, temp_req->timeout)) + break; + } + + list_add(&req->list, &temp_req->list); + + if (req_list.next == &req->list) + set_timeout(req->timeout); + mutex_unlock(&lock); +} + +static void addr_send_arp(struct sockaddr_in *dst_in) +{ + struct rtable *rt; + struct flowi fl; + u32 dst_ip = dst_in->sin_addr.s_addr; + + memset(&fl, 0, sizeof fl); + fl.nl_u.ip4_u.daddr = dst_ip; + if (ip_route_output_key(&rt, &fl)) + return; + + arp_send(ARPOP_REQUEST, ETH_P_ARP, rt->rt_gateway, rt->idev->dev, + rt->rt_src, NULL, rt->idev->dev->dev_addr, NULL); + ip_rt_put(rt); +} + +static int addr_resolve_remote(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) +{ + u32 src_ip = src_in->sin_addr.s_addr; + u32 dst_ip = dst_in->sin_addr.s_addr; + struct flowi fl; + struct rtable *rt; + struct neighbour *neigh; + int ret; + + memset(&fl, 0, sizeof fl); + fl.nl_u.ip4_u.daddr = dst_ip; + fl.nl_u.ip4_u.saddr = src_ip; + ret = ip_route_output_key(&rt, &fl); + if (ret) + goto out; + + /* If the device does ARP internally, return 'done' */ + if (rt->idev->dev->flags & IFF_NOARP) { + copy_addr(addr, rt->idev->dev, NULL); + goto put; + } + + neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev); + if (!neigh) { + ret = -ENODATA; + goto put; + } + + if (!(neigh->nud_state & NUD_VALID)) { + ret = -ENODATA; + goto release; + } + + if (!src_ip) { + src_in->sin_family = dst_in->sin_family; + src_in->sin_addr.s_addr = rt->rt_src; + } + + ret = copy_addr(addr, neigh->dev, neigh->ha); +release: + neigh_release(neigh); +put: + ip_rt_put(rt); +out: + return ret; +} + +static void process_req(void *data) +{ + struct addr_req *req, *temp_req; + struct sockaddr_in *src_in, *dst_in; + struct list_head done_list; + + INIT_LIST_HEAD(&done_list); + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->status) { + src_in = (struct sockaddr_in *) &req->src_addr; + dst_in = (struct sockaddr_in *) &req->dst_addr; + req->status = addr_resolve_remote(src_in, dst_in, + req->addr); + } + if (req->status && time_after(jiffies, req->timeout)) + req->status = -ETIMEDOUT; + else if (req->status == -ENODATA) + continue; + + list_del(&req->list); + list_add_tail(&req->list, &done_list); + } + + if (!list_empty(&req_list)) { + req = list_entry(req_list.next, struct addr_req, list); + set_timeout(req->timeout); + } + mutex_unlock(&lock); + + list_for_each_entry_safe(req, temp_req, &done_list, list) { + list_del(&req->list); + req->callback(req->status, &req->src_addr, req->addr, + req->context); + kfree(req); + } +} + +static int addr_resolve_local(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) +{ + struct net_device *dev; + u32 src_ip = src_in->sin_addr.s_addr; + u32 dst_ip = dst_in->sin_addr.s_addr; + int ret; + + dev = ip_dev_find(dst_ip); + if (!dev) + return -EADDRNOTAVAIL; + + if (ZERONET(src_ip)) { + src_in->sin_family = dst_in->sin_family; + src_in->sin_addr.s_addr = dst_ip; + ret = copy_addr(addr, dev, dev->dev_addr); + } else if (LOOPBACK(src_ip)) { + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } else { + ret = rdma_translate_ip((struct sockaddr *)src_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } + + dev_put(dev); + return ret; +} + +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context) +{ + struct sockaddr_in *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = kmalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + memset(req, 0, sizeof *req); + + if (src_addr) + memcpy(&req->src_addr, src_addr, ip_addr_size(src_addr)); + memcpy(&req->dst_addr, dst_addr, ip_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + + src_in = (struct sockaddr_in *) &req->src_addr; + dst_in = (struct sockaddr_in *) &req->dst_addr; + + req->status = addr_resolve_local(src_in, dst_in, addr); + if (req->status == -EADDRNOTAVAIL) + req->status = addr_resolve_remote(src_in, dst_in, addr); + + switch (req->status) { + case 0: + req->timeout = jiffies; + queue_req(req); + break; + case -ENODATA: + req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; + queue_req(req); + addr_send_arp(dst_in); + break; + default: + ret = req->status; + kfree(req); + break; + } + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ip); + +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *temp_req; + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->addr == addr) { + req->status = -ECANCELED; + req->timeout = jiffies; + list_del(&req->list); + list_add(&req->list, &req_list); + set_timeout(req->timeout); + break; + } + } + mutex_unlock(&lock); +} +EXPORT_SYMBOL(rdma_addr_cancel); + +static int addr_arp_recv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pkt, struct net_device *orig_dev) +{ + struct arphdr *arp_hdr; + + arp_hdr = (struct arphdr *) skb->nh.raw; + + if (arp_hdr->ar_op == htons(ARPOP_REQUEST) || + arp_hdr->ar_op == htons(ARPOP_REPLY)) + set_timeout(jiffies); + + kfree_skb(skb); + return 0; +} + +static struct packet_type addr_arp = { + .type = __constant_htons(ETH_P_ARP), + .func = addr_arp_recv, + .af_packet_priv = (void*) 1, +}; + +static int addr_init(void) +{ + addr_wq = create_singlethread_workqueue("ib_addr_wq"); + if (!addr_wq) + return -ENOMEM; + + dev_add_pack(&addr_arp); + return 0; +} + +static void addr_cleanup(void) +{ + dev_remove_pack(&addr_arp); + destroy_workqueue(addr_wq); +} + +module_init(addr_init); +module_exit(addr_cleanup); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h new file mode 100644 index 0000000000000..fcb5ba87dcc5f --- /dev/null +++ b/include/rdma/ib_addr.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. + * + */ + +#if !defined(IB_ADDR_H) +#define IB_ADDR_H + +#include +#include +#include +#include +#include + +struct rdma_dev_addr { + unsigned char src_dev_addr[MAX_ADDR_LEN]; + unsigned char dst_dev_addr[MAX_ADDR_LEN]; + unsigned char broadcast[MAX_ADDR_LEN]; + enum ib_node_type dev_type; +}; + +/** + * rdma_translate_ip - Translate a local IP address to an RDMA hardware + * address. + */ +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr); + +/** + * rdma_resolve_ip - Resolve source and destination IP addresses to + * RDMA hardware addresses. + * @src_addr: An optional source address to use in the resolution. If a + * source address is not provided, a usable address will be returned via + * the callback. + * @dst_addr: The destination address to resolve. + * @addr: A reference to a data location that will receive the resolved + * addresses. The data location must remain valid until the callback has + * been invoked. + * @timeout_ms: Amount of time to wait for the address resolution to complete. + * @callback: Call invoked once address resolution has completed, timed out, + * or been canceled. A status of 0 indicates success. + * @context: User-specified context associated with the call. + */ +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context); + +void rdma_addr_cancel(struct rdma_dev_addr *addr); + +static inline int ip_addr_size(struct sockaddr *addr) +{ + return addr->sa_family == AF_INET6 ? + sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); +} + +static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) +{ + return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; +} + +static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) +{ + dev_addr->broadcast[8] = pkey >> 8; + dev_addr->broadcast[9] = (unsigned char) pkey; +} + +static inline union ib_gid *ib_addr_get_sgid(struct rdma_dev_addr *dev_addr) +{ + return (union ib_gid *) (dev_addr->src_dev_addr + 4); +} + +static inline void ib_addr_set_sgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(dev_addr->src_dev_addr + 4, gid, sizeof *gid); +} + +static inline union ib_gid *ib_addr_get_dgid(struct rdma_dev_addr *dev_addr) +{ + return (union ib_gid *) (dev_addr->dst_dev_addr + 4); +} + +static inline void ib_addr_set_dgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(dev_addr->dst_dev_addr + 4, gid, sizeof *gid); +} + +#endif /* IB_ADDR_H */ -- cgit 1.2.3-korg