1321936Shselasky/*
2321936Shselasky * Copyright (c) 2010-2012 Intel Corporation.  All rights reserved.
3321936Shselasky *
4321936Shselasky * This software is available to you under a choice of one of two
5321936Shselasky * licenses.  You may choose to be licensed under the terms of the GNU
6321936Shselasky * General Public License (GPL) Version 2, available from the file
7321936Shselasky * COPYING in the main directory of this source tree, or the
8321936Shselasky * OpenIB.org BSD license below:
9321936Shselasky *
10321936Shselasky *     Redistribution and use in source and binary forms, with or
11321936Shselasky *     without modification, are permitted provided that the following
12321936Shselasky *     conditions are met:
13321936Shselasky *
14321936Shselasky *      - Redistributions of source code must retain the above
15321936Shselasky *        copyright notice, this list of conditions and the following
16321936Shselasky *        disclaimer.
17321936Shselasky *
18321936Shselasky *      - Redistributions in binary form must reproduce the above
19321936Shselasky *        copyright notice, this list of conditions and the following
20321936Shselasky *        disclaimer in the documentation and/or other materials
21321936Shselasky *        provided with the distribution.
22321936Shselasky *
23321936Shselasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24321936Shselasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25321936Shselasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26321936Shselasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27321936Shselasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28321936Shselasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29321936Shselasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30321936Shselasky * SOFTWARE.
31321936Shselasky */
32321936Shselasky
33321936Shselasky#include <config.h>
34321936Shselasky
35321936Shselasky#include <stdio.h>
36321936Shselasky#include <inttypes.h>
37321936Shselasky#include <sys/types.h>
38321936Shselasky#include <sys/socket.h>
39321936Shselasky#include <netdb.h>
40321936Shselasky#include <unistd.h>
41321936Shselasky
42321936Shselasky#include "cma.h"
43321936Shselasky#include <rdma/rdma_cma.h>
44321936Shselasky#include <infiniband/ib.h>
45321936Shselasky#include <infiniband/sa.h>
46321936Shselasky
/* Protocol version written into acm_hdr.version of each request. */
#define ACM_VERSION		1

/*
 * Values for acm_hdr.opcode.  Requests built in this file use
 * ACM_OP_RESOLVE; ACM_OP_ACK is not referenced here (presumably set by the
 * peer in replies -- confirm against the server implementation).
 */
#define ACM_OP_RESOLVE		0x01
#define ACM_OP_ACK		0x80

/*
 * Values for acm_hdr.status.  This file only distinguishes zero (success)
 * from non-zero (request failed).
 */
#define ACM_STATUS_SUCCESS	0
#define ACM_STATUS_ENOMEM	1
#define ACM_STATUS_EINVAL	2
#define ACM_STATUS_ENODATA	3
#define ACM_STATUS_ENOTCONN	5
#define ACM_STATUS_ETIMEDOUT	6
#define ACM_STATUS_ESRCADDR	7
#define ACM_STATUS_ESRCTYPE	8
#define ACM_STATUS_EDESTADDR	9
#define ACM_STATUS_EDESTTYPE	10

/* Extra bit OR-ed into the flags of a destination endpoint entry. */
#define ACM_FLAGS_NODELAY	(1 << 30)

/*
 * Message geometry, in bytes: the fixed header, the endpoint payload, one
 * complete endpoint entry, and the data area (room for eight entries).
 */
#define ACM_MSG_HDR_LENGTH	16
#define ACM_MAX_ADDRESS		64
#define ACM_MSG_EP_LENGTH	72
#define ACM_MSG_DATA_LENGTH	(ACM_MSG_EP_LENGTH * 8)
69321936Shselasky
/*
 * Fixed header that begins every message exchanged on the ACM socket.
 * ACM_MSG_HDR_LENGTH is the size this file assumes for it.
 */
struct acm_hdr {
	uint8_t  version;	/* ACM_VERSION */
	uint8_t  opcode;	/* ACM_OP_* */
	uint8_t  status;	/* ACM_STATUS_*; non-zero in a reply means failure */
	uint8_t  data[3];	/* not touched by this file */
	uint16_t length;	/* whole message length in bytes, header included */
	uint64_t tid;		/* not touched by this file (left zero in requests) */
};
78321936Shselasky
79321936Shselasky#define ACM_EP_INFO_NAME        0x0001
80321936Shselasky#define ACM_EP_INFO_ADDRESS_IP  0x0002
81321936Shselasky#define ACM_EP_INFO_ADDRESS_IP6 0x0003
82321936Shselasky#define ACM_EP_INFO_PATH        0x0010
83321936Shselasky
84321936Shselaskyunion acm_ep_info {
85321936Shselasky	uint8_t                 addr[ACM_MAX_ADDRESS];
86321936Shselasky	uint8_t                 name[ACM_MAX_ADDRESS];
87321936Shselasky	struct ibv_path_record  path;
88321936Shselasky};
89321936Shselasky
90321936Shselasky#define ACM_EP_FLAG_SOURCE      (1<<0)
91321936Shselasky#define ACM_EP_FLAG_DEST        (1<<1)
92321936Shselasky
93321936Shselaskystruct acm_ep_addr_data {
94321936Shselasky	uint32_t                flags;
95321936Shselasky	uint16_t                type;
96321936Shselasky	uint16_t                reserved;
97321936Shselasky	union acm_ep_info       info;
98321936Shselasky};
99321936Shselasky
100321936Shselaskystruct acm_resolve_msg {
101321936Shselasky	struct acm_hdr          hdr;
102321936Shselasky	struct acm_ep_addr_data data[0];
103321936Shselasky};
104321936Shselasky
105321936Shselaskystruct acm_msg {
106321936Shselasky	struct acm_hdr                  hdr;
107321936Shselasky	union{
108321936Shselasky		uint8_t                 data[ACM_MSG_DATA_LENGTH];
109321936Shselasky		struct acm_ep_addr_data resolve_data[0];
110321936Shselasky	};
111321936Shselasky};
112321936Shselasky
/* Guards the one-time setup in ucma_ib_init() and each send/recv exchange. */
static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER;

/* Descriptor of the connection made by ucma_ib_init(); negative when absent. */
static int sock = -1;

/* Port read from IBACM_PORT_FILE, host byte order; 0 when none was read. */
static uint16_t server_port;
116321936Shselasky
117321936Shselaskystatic int ucma_set_server_port(void)
118321936Shselasky{
119321936Shselasky	FILE *f;
120321936Shselasky
121321936Shselasky	if ((f = fopen(IBACM_PORT_FILE, "r" STREAM_CLOEXEC))) {
122321936Shselasky		if (fscanf(f, "%" SCNu16, &server_port) != 1)
123321936Shselasky			server_port = 0;
124321936Shselasky		fclose(f);
125321936Shselasky	}
126321936Shselasky	return server_port;
127321936Shselasky}
128321936Shselasky
129321936Shselaskyvoid ucma_ib_init(void)
130321936Shselasky{
131321936Shselasky	struct sockaddr_in addr;
132321936Shselasky	static int init;
133321936Shselasky	int ret;
134321936Shselasky
135321936Shselasky	if (init)
136321936Shselasky		return;
137321936Shselasky
138321936Shselasky	pthread_mutex_lock(&acm_lock);
139321936Shselasky	if (init)
140321936Shselasky		goto unlock;
141321936Shselasky
142321936Shselasky	if (!ucma_set_server_port())
143321936Shselasky		goto out;
144321936Shselasky
145321936Shselasky	sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
146321936Shselasky	if (sock < 0)
147321936Shselasky		goto out;
148321936Shselasky
149321936Shselasky	memset(&addr, 0, sizeof addr);
150321936Shselasky	addr.sin_family = AF_INET;
151321936Shselasky	addr.sin_addr.s_addr = htobe32(INADDR_LOOPBACK);
152321936Shselasky	addr.sin_port = htobe16(server_port);
153321936Shselasky	ret = connect(sock, (struct sockaddr *) &addr, sizeof(addr));
154321936Shselasky	if (ret) {
155321936Shselasky		close(sock);
156321936Shselasky		sock = -1;
157321936Shselasky	}
158321936Shselaskyout:
159321936Shselasky	init = 1;
160321936Shselaskyunlock:
161321936Shselasky	pthread_mutex_unlock(&acm_lock);
162321936Shselasky}
163321936Shselasky
164321936Shselaskyvoid ucma_ib_cleanup(void)
165321936Shselasky{
166321936Shselasky	if (sock >= 0) {
167321936Shselasky		shutdown(sock, SHUT_RDWR);
168321936Shselasky		close(sock);
169321936Shselasky	}
170321936Shselasky}
171321936Shselasky
172321936Shselaskystatic int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai,
173321936Shselasky			    struct rdma_addrinfo *rai)
174321936Shselasky{
175321936Shselasky	struct sockaddr_ib *src, *dst;
176321936Shselasky	struct ibv_path_record *path;
177321936Shselasky
178321936Shselasky	src = calloc(1, sizeof(*src));
179321936Shselasky	if (!src)
180321936Shselasky		return ERR(ENOMEM);
181321936Shselasky
182321936Shselasky	dst = calloc(1, sizeof(*dst));
183321936Shselasky	if (!dst) {
184321936Shselasky		free(src);
185321936Shselasky		return ERR(ENOMEM);
186321936Shselasky	}
187321936Shselasky
188321936Shselasky	path = &((struct ibv_path_data *) ib_rai->ai_route)->path;
189321936Shselasky
190321936Shselasky	src->sib_family = AF_IB;
191321936Shselasky	src->sib_pkey = path->pkey;
192321936Shselasky	src->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8);
193321936Shselasky	memcpy(&src->sib_addr, &path->sgid, 16);
194321936Shselasky	ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src);
195321936Shselasky
196321936Shselasky	dst->sib_family = AF_IB;
197321936Shselasky	dst->sib_pkey = path->pkey;
198321936Shselasky	dst->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8);
199321936Shselasky	memcpy(&dst->sib_addr, &path->dgid, 16);
200321936Shselasky	ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst);
201321936Shselasky
202321936Shselasky	ib_rai->ai_src_addr = (struct sockaddr *) src;
203321936Shselasky	ib_rai->ai_src_len = sizeof(*src);
204321936Shselasky
205321936Shselasky	ib_rai->ai_dst_addr = (struct sockaddr *) dst;
206321936Shselasky	ib_rai->ai_dst_len = sizeof(*dst);
207321936Shselasky
208321936Shselasky	return 0;
209321936Shselasky}
210321936Shselasky
211321936Shselaskystatic int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai,
212321936Shselasky			       struct rdma_addrinfo *rai)
213321936Shselasky{
214321936Shselasky	struct ib_connect_hdr *hdr;
215321936Shselasky
216321936Shselasky	if (rai->ai_family == AF_IB)
217321936Shselasky		return 0;
218321936Shselasky
219321936Shselasky	hdr = calloc(1, sizeof(*hdr));
220321936Shselasky	if (!hdr)
221321936Shselasky		return ERR(ENOMEM);
222321936Shselasky
223321936Shselasky	if (rai->ai_family == AF_INET) {
224321936Shselasky		hdr->ip_version = 4 << 4;
225321936Shselasky		memcpy(&hdr->cma_src_ip4,
226321936Shselasky		       &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4);
227321936Shselasky		memcpy(&hdr->cma_dst_ip4,
228321936Shselasky		       &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4);
229321936Shselasky	} else {
230321936Shselasky		hdr->ip_version = 6 << 4;
231321936Shselasky		memcpy(&hdr->cma_src_ip6,
232321936Shselasky		       &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16);
233321936Shselasky		memcpy(&hdr->cma_dst_ip6,
234321936Shselasky		       &((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16);
235321936Shselasky	}
236321936Shselasky
237321936Shselasky	ib_rai->ai_connect = hdr;
238321936Shselasky	ib_rai->ai_connect_len = sizeof(*hdr);
239321936Shselasky	return 0;
240321936Shselasky}
241321936Shselasky
242321936Shselaskystatic void ucma_resolve_af_ib(struct rdma_addrinfo **rai)
243321936Shselasky{
244321936Shselasky	struct rdma_addrinfo *ib_rai;
245321936Shselasky
246321936Shselasky	ib_rai = calloc(1, sizeof(*ib_rai));
247321936Shselasky	if (!ib_rai)
248321936Shselasky		return;
249321936Shselasky
250321936Shselasky	ib_rai->ai_flags = (*rai)->ai_flags;
251321936Shselasky	ib_rai->ai_family = AF_IB;
252321936Shselasky	ib_rai->ai_qp_type = (*rai)->ai_qp_type;
253321936Shselasky	ib_rai->ai_port_space = (*rai)->ai_port_space;
254321936Shselasky
255321936Shselasky	ib_rai->ai_route = calloc(1, (*rai)->ai_route_len);
256321936Shselasky	if (!ib_rai->ai_route)
257321936Shselasky		goto err;
258321936Shselasky
259321936Shselasky	memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len);
260321936Shselasky	ib_rai->ai_route_len = (*rai)->ai_route_len;
261321936Shselasky
262321936Shselasky	if ((*rai)->ai_src_canonname) {
263321936Shselasky		ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname);
264321936Shselasky		if (!ib_rai->ai_src_canonname)
265321936Shselasky			goto err;
266321936Shselasky	}
267321936Shselasky
268321936Shselasky	if ((*rai)->ai_dst_canonname) {
269321936Shselasky		ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname);
270321936Shselasky		if (!ib_rai->ai_dst_canonname)
271321936Shselasky			goto err;
272321936Shselasky	}
273321936Shselasky
274321936Shselasky	if (ucma_ib_set_connect(ib_rai, *rai))
275321936Shselasky		goto err;
276321936Shselasky
277321936Shselasky	if (ucma_ib_set_addr(ib_rai, *rai))
278321936Shselasky		goto err;
279321936Shselasky
280321936Shselasky	ib_rai->ai_next = *rai;
281321936Shselasky	*rai = ib_rai;
282321936Shselasky	return;
283321936Shselasky
284321936Shselaskyerr:
285321936Shselasky	rdma_freeaddrinfo(ib_rai);
286321936Shselasky}
287321936Shselasky
288321936Shselaskystatic void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg)
289321936Shselasky{
290321936Shselasky	struct acm_ep_addr_data *ep_data;
291321936Shselasky	struct ibv_path_data *path_data = NULL;
292321936Shselasky	struct sockaddr_in *sin;
293321936Shselasky	struct sockaddr_in6 *sin6;
294321936Shselasky	int i, cnt, path_cnt = 0;
295321936Shselasky
296321936Shselasky	cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH;
297321936Shselasky	for (i = 0; i < cnt; i++) {
298321936Shselasky		ep_data = &msg->resolve_data[i];
299321936Shselasky		switch (ep_data->type) {
300321936Shselasky		case ACM_EP_INFO_PATH:
301321936Shselasky			ep_data->type = 0;
302321936Shselasky			if (!path_data)
303321936Shselasky				path_data = (struct ibv_path_data *) ep_data;
304321936Shselasky			path_cnt++;
305321936Shselasky			break;
306321936Shselasky		case ACM_EP_INFO_ADDRESS_IP:
307321936Shselasky			if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len)
308321936Shselasky				break;
309321936Shselasky
310321936Shselasky			sin = calloc(1, sizeof(*sin));
311321936Shselasky			if (!sin)
312321936Shselasky				break;
313321936Shselasky
314321936Shselasky			sin->sin_family = AF_INET;
315321936Shselasky			memcpy(&sin->sin_addr, &ep_data->info.addr, 4);
316321936Shselasky			rai->ai_src_len = sizeof(*sin);
317321936Shselasky			rai->ai_src_addr = (struct sockaddr *) sin;
318321936Shselasky			break;
319321936Shselasky		case ACM_EP_INFO_ADDRESS_IP6:
320321936Shselasky			if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len)
321321936Shselasky				break;
322321936Shselasky
323321936Shselasky			sin6 = calloc(1, sizeof(*sin6));
324321936Shselasky			if (!sin6)
325321936Shselasky				break;
326321936Shselasky
327321936Shselasky			sin6->sin6_family = AF_INET6;
328321936Shselasky			memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16);
329321936Shselasky			rai->ai_src_len = sizeof(*sin6);
330321936Shselasky			rai->ai_src_addr = (struct sockaddr *) sin6;
331321936Shselasky			break;
332321936Shselasky		default:
333321936Shselasky			break;
334321936Shselasky		}
335321936Shselasky	}
336321936Shselasky
337321936Shselasky	rai->ai_route = calloc(path_cnt, sizeof(*path_data));
338321936Shselasky	if (rai->ai_route) {
339321936Shselasky		memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data));
340321936Shselasky		rai->ai_route_len = path_cnt * sizeof(*path_data);
341321936Shselasky	}
342321936Shselasky}
343321936Shselasky
344321936Shselaskystatic void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr)
345321936Shselasky{
346321936Shselasky	if (addr->sa_family == AF_INET) {
347321936Shselasky		data->type = ACM_EP_INFO_ADDRESS_IP;
348321936Shselasky		memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4);
349321936Shselasky	} else {
350321936Shselasky		data->type = ACM_EP_INFO_ADDRESS_IP6;
351321936Shselasky		memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16);
352321936Shselasky	}
353321936Shselasky}
354321936Shselasky
/*
 * Report whether (addr, len) describes an IP address.
 *
 * Returns 1 when len is non-zero, addr is non-NULL and its family is
 * AF_INET or AF_INET6; returns 0 otherwise.
 */
static int ucma_inet_addr(struct sockaddr *addr, socklen_t len)
{
	if (!len || !addr)
		return 0;

	switch (addr->sa_family) {
	case AF_INET:
	case AF_INET6:
		return 1;
	default:
		return 0;
	}
}
360321936Shselasky
/*
 * Report whether (addr, len) describes an AF_IB address.
 *
 * Returns 1 when len is non-zero, addr is non-NULL and its family is AF_IB;
 * returns 0 otherwise.
 */
static int ucma_ib_addr(struct sockaddr *addr, socklen_t len)
{
	if (!len || !addr)
		return 0;

	return addr->sa_family == AF_IB;
}
365321936Shselasky
366321936Shselaskyvoid ucma_ib_resolve(struct rdma_addrinfo **rai,
367321936Shselasky		     const struct rdma_addrinfo *hints)
368321936Shselasky{
369321936Shselasky	struct acm_msg msg;
370321936Shselasky	struct acm_ep_addr_data *data;
371321936Shselasky	int ret;
372321936Shselasky
373321936Shselasky	ucma_ib_init();
374321936Shselasky	if (sock < 0)
375321936Shselasky		return;
376321936Shselasky
377321936Shselasky	memset(&msg, 0, sizeof msg);
378321936Shselasky	msg.hdr.version = ACM_VERSION;
379321936Shselasky	msg.hdr.opcode = ACM_OP_RESOLVE;
380321936Shselasky	msg.hdr.length = ACM_MSG_HDR_LENGTH;
381321936Shselasky
382321936Shselasky	data = &msg.resolve_data[0];
383321936Shselasky	if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
384321936Shselasky		data->flags = ACM_EP_FLAG_SOURCE;
385321936Shselasky		ucma_set_ep_addr(data, (*rai)->ai_src_addr);
386321936Shselasky		data++;
387321936Shselasky		msg.hdr.length += ACM_MSG_EP_LENGTH;
388321936Shselasky	}
389321936Shselasky
390321936Shselasky	if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
391321936Shselasky		data->flags = ACM_EP_FLAG_DEST;
392321936Shselasky		if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE))
393321936Shselasky			data->flags |= ACM_FLAGS_NODELAY;
394321936Shselasky		ucma_set_ep_addr(data, (*rai)->ai_dst_addr);
395321936Shselasky		data++;
396321936Shselasky		msg.hdr.length += ACM_MSG_EP_LENGTH;
397321936Shselasky	}
398321936Shselasky
399321936Shselasky	if (hints->ai_route_len ||
400321936Shselasky	    ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) ||
401321936Shselasky	    ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
402321936Shselasky		struct ibv_path_record *path;
403321936Shselasky
404321936Shselasky		if (hints->ai_route_len == sizeof(struct ibv_path_record))
405321936Shselasky			path = (struct ibv_path_record *) hints->ai_route;
406321936Shselasky		else if (hints->ai_route_len == sizeof(struct ibv_path_data))
407321936Shselasky			path = &((struct ibv_path_data *) hints->ai_route)->path;
408321936Shselasky		else
409321936Shselasky			path = NULL;
410321936Shselasky
411321936Shselasky		if (path)
412321936Shselasky			memcpy(&data->info.path, path, sizeof(*path));
413321936Shselasky
414321936Shselasky		if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
415321936Shselasky			memcpy(&data->info.path.sgid,
416321936Shselasky			       &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16);
417321936Shselasky		}
418321936Shselasky		if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
419321936Shselasky			memcpy(&data->info.path.dgid,
420321936Shselasky			       &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16);
421321936Shselasky		}
422321936Shselasky		data->type = ACM_EP_INFO_PATH;
423321936Shselasky		data++;
424321936Shselasky		msg.hdr.length += ACM_MSG_EP_LENGTH;
425321936Shselasky	}
426321936Shselasky
427321936Shselasky	pthread_mutex_lock(&acm_lock);
428321936Shselasky	ret = send(sock, (char *) &msg, msg.hdr.length, 0);
429321936Shselasky	if (ret != msg.hdr.length) {
430321936Shselasky		pthread_mutex_unlock(&acm_lock);
431321936Shselasky		return;
432321936Shselasky	}
433321936Shselasky
434321936Shselasky	ret = recv(sock, (char *) &msg, sizeof msg, 0);
435321936Shselasky	pthread_mutex_unlock(&acm_lock);
436321936Shselasky	if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status)
437321936Shselasky		return;
438321936Shselasky
439321936Shselasky	ucma_ib_save_resp(*rai, &msg);
440321936Shselasky
441321936Shselasky	if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len)
442321936Shselasky		ucma_resolve_af_ib(rai);
443321936Shselasky}
444