/* acm.c revision 331769 */
/*
 * Copyright (c) 2010-2012 Intel Corporation.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
32
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <unistd.h>

#include "cma.h"
#include <rdma/rdma_cma.h>
#include <infiniband/ib.h>
#include <infiniband/sa.h>
46
/* Protocol version placed in acm_hdr.version of every request. */
#define ACM_VERSION             1

/* Message opcodes; ACM_OP_RESOLVE is what ucma_ib_resolve() sends. */
#define ACM_OP_RESOLVE          0x01
#define ACM_OP_ACK              0x80

/*
 * Status codes.  Only "zero vs. non-zero" is examined in this file
 * (a non-zero acm_hdr.status makes the response be ignored).
 */
#define ACM_STATUS_SUCCESS      0
#define ACM_STATUS_ENOMEM       1
#define ACM_STATUS_EINVAL       2
#define ACM_STATUS_ENODATA      3
#define ACM_STATUS_ENOTCONN     5
#define ACM_STATUS_ETIMEDOUT    6
#define ACM_STATUS_ESRCADDR     7
#define ACM_STATUS_ESRCTYPE     8
#define ACM_STATUS_EDESTADDR    9
#define ACM_STATUS_EDESTTYPE    10

/* OR-ed into the destination endpoint's flags for NUMERICHOST/NOROUTE hints. */
#define ACM_FLAGS_NODELAY	(1<<30)

/* Sizes, in bytes, of the message pieces. */
#define ACM_MSG_HDR_LENGTH      16	/* length of struct acm_hdr */
#define ACM_MAX_ADDRESS         64	/* size of the addr/name buffers */
#define ACM_MSG_EP_LENGTH       72	/* length added per endpoint entry */
#define ACM_MSG_DATA_LENGTH     (ACM_MSG_EP_LENGTH * 8)	/* room for 8 entries */
69
/*
 * Fixed header that starts every ACM message.
 *
 * In this file 'length' is the total message size in bytes (header plus
 * endpoint entries) and 'status' is checked for non-zero on responses.
 */
struct acm_hdr {
	uint8_t                 version;
	uint8_t                 opcode;
	uint8_t                 status;
	uint8_t		        data[3];
	uint16_t                length;
	uint64_t                tid;
};
78
/* Values for acm_ep_addr_data.type: which acm_ep_info member is in use. */
#define ACM_EP_INFO_NAME        0x0001
#define ACM_EP_INFO_ADDRESS_IP  0x0002
#define ACM_EP_INFO_ADDRESS_IP6 0x0003
#define ACM_EP_INFO_PATH        0x0010
83
84union acm_ep_info {
85	uint8_t                 addr[ACM_MAX_ADDRESS];
86	uint8_t                 name[ACM_MAX_ADDRESS];
87	struct ibv_path_record  path;
88};
89
/* Bits for acm_ep_addr_data.flags: which side of the connection it describes. */
#define ACM_EP_FLAG_SOURCE      (1<<0)
#define ACM_EP_FLAG_DEST        (1<<1)
92
93struct acm_ep_addr_data {
94	uint32_t                flags;
95	uint16_t                type;
96	uint16_t                reserved;
97	union acm_ep_info       info;
98};
99
100struct acm_resolve_msg {
101	struct acm_hdr          hdr;
102	struct acm_ep_addr_data data[0];
103};
104
105struct acm_msg {
106	struct acm_hdr                  hdr;
107	union{
108		uint8_t                 data[ACM_MSG_DATA_LENGTH];
109		struct acm_ep_addr_data resolve_data[0];
110	};
111};
112
/* Guards one-time initialisation and each send/recv exchange on 'sock'. */
static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER;
/* Connection to the ACM service; -1 while not connected. */
static int sock = -1;
/* Service port read from IBACM_PORT_FILE; 0 means "not available". */
static uint16_t server_port;
116
117static int ucma_set_server_port(void)
118{
119	FILE *f;
120
121	if ((f = fopen(IBACM_PORT_FILE, "r" STREAM_CLOEXEC))) {
122		if (fscanf(f, "%" SCNu16, &server_port) != 1)
123			server_port = 0;
124		fclose(f);
125	}
126	return server_port;
127}
128
129void ucma_ib_init(void)
130{
131	struct sockaddr_in addr;
132	static int init;
133	int ret;
134
135	if (init)
136		return;
137
138	pthread_mutex_lock(&acm_lock);
139	if (init)
140		goto unlock;
141
142	if (!ucma_set_server_port())
143		goto out;
144
145	sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
146	if (sock < 0)
147		goto out;
148
149	memset(&addr, 0, sizeof addr);
150	addr.sin_family = AF_INET;
151	addr.sin_addr.s_addr = htobe32(INADDR_LOOPBACK);
152	addr.sin_port = htobe16(server_port);
153	ret = connect(sock, (struct sockaddr *) &addr, sizeof(addr));
154	if (ret) {
155		close(sock);
156		sock = -1;
157	}
158out:
159	init = 1;
160unlock:
161	pthread_mutex_unlock(&acm_lock);
162}
163
164void ucma_ib_cleanup(void)
165{
166	if (sock >= 0) {
167		shutdown(sock, SHUT_RDWR);
168		close(sock);
169	}
170}
171
172static int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai,
173			    struct rdma_addrinfo *rai)
174{
175	struct sockaddr_ib *src, *dst;
176	struct ibv_path_record *path;
177
178	src = calloc(1, sizeof(*src));
179	if (!src)
180		return ERR(ENOMEM);
181
182	dst = calloc(1, sizeof(*dst));
183	if (!dst) {
184		free(src);
185		return ERR(ENOMEM);
186	}
187
188	path = &((struct ibv_path_data *) ib_rai->ai_route)->path;
189
190	src->sib_family = AF_IB;
191	src->sib_pkey = path->pkey;
192	src->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8);
193	memcpy(&src->sib_addr, &path->sgid, 16);
194	ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src);
195
196	dst->sib_family = AF_IB;
197	dst->sib_pkey = path->pkey;
198	dst->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8);
199	memcpy(&dst->sib_addr, &path->dgid, 16);
200	ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst);
201
202	ib_rai->ai_src_addr = (struct sockaddr *) src;
203	ib_rai->ai_src_len = sizeof(*src);
204
205	ib_rai->ai_dst_addr = (struct sockaddr *) dst;
206	ib_rai->ai_dst_len = sizeof(*dst);
207
208	return 0;
209}
210
211static int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai,
212			       struct rdma_addrinfo *rai)
213{
214	struct ib_connect_hdr *hdr;
215
216	if (rai->ai_family == AF_IB)
217		return 0;
218
219	hdr = calloc(1, sizeof(*hdr));
220	if (!hdr)
221		return ERR(ENOMEM);
222
223	if (rai->ai_family == AF_INET) {
224		hdr->ip_version = 4 << 4;
225		memcpy(&hdr->cma_src_ip4,
226		       &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4);
227		memcpy(&hdr->cma_dst_ip4,
228		       &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4);
229	} else {
230		hdr->ip_version = 6 << 4;
231		memcpy(&hdr->cma_src_ip6,
232		       &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16);
233		memcpy(&hdr->cma_dst_ip6,
234		       &((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16);
235	}
236
237	ib_rai->ai_connect = hdr;
238	ib_rai->ai_connect_len = sizeof(*hdr);
239	return 0;
240}
241
242static void ucma_resolve_af_ib(struct rdma_addrinfo **rai)
243{
244	struct rdma_addrinfo *ib_rai;
245
246	ib_rai = calloc(1, sizeof(*ib_rai));
247	if (!ib_rai)
248		return;
249
250	ib_rai->ai_flags = (*rai)->ai_flags;
251	ib_rai->ai_family = AF_IB;
252	ib_rai->ai_qp_type = (*rai)->ai_qp_type;
253	ib_rai->ai_port_space = (*rai)->ai_port_space;
254
255	ib_rai->ai_route = calloc(1, (*rai)->ai_route_len);
256	if (!ib_rai->ai_route)
257		goto err;
258
259	memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len);
260	ib_rai->ai_route_len = (*rai)->ai_route_len;
261
262	if ((*rai)->ai_src_canonname) {
263		ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname);
264		if (!ib_rai->ai_src_canonname)
265			goto err;
266	}
267
268	if ((*rai)->ai_dst_canonname) {
269		ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname);
270		if (!ib_rai->ai_dst_canonname)
271			goto err;
272	}
273
274	if (ucma_ib_set_connect(ib_rai, *rai))
275		goto err;
276
277	if (ucma_ib_set_addr(ib_rai, *rai))
278		goto err;
279
280	ib_rai->ai_next = *rai;
281	*rai = ib_rai;
282	return;
283
284err:
285	rdma_freeaddrinfo(ib_rai);
286}
287
288static void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg)
289{
290	struct acm_ep_addr_data *ep_data;
291	struct ibv_path_data *path_data = NULL;
292	struct sockaddr_in *sin;
293	struct sockaddr_in6 *sin6;
294	int i, cnt, path_cnt = 0;
295
296	cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH;
297	for (i = 0; i < cnt; i++) {
298		ep_data = &msg->resolve_data[i];
299		switch (ep_data->type) {
300		case ACM_EP_INFO_PATH:
301			ep_data->type = 0;
302			if (!path_data)
303				path_data = (struct ibv_path_data *) ep_data;
304			path_cnt++;
305			break;
306		case ACM_EP_INFO_ADDRESS_IP:
307			if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len)
308				break;
309
310			sin = calloc(1, sizeof(*sin));
311			if (!sin)
312				break;
313
314			sin->sin_family = AF_INET;
315			memcpy(&sin->sin_addr, &ep_data->info.addr, 4);
316			rai->ai_src_len = sizeof(*sin);
317			rai->ai_src_addr = (struct sockaddr *) sin;
318			break;
319		case ACM_EP_INFO_ADDRESS_IP6:
320			if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len)
321				break;
322
323			sin6 = calloc(1, sizeof(*sin6));
324			if (!sin6)
325				break;
326
327			sin6->sin6_family = AF_INET6;
328			memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16);
329			rai->ai_src_len = sizeof(*sin6);
330			rai->ai_src_addr = (struct sockaddr *) sin6;
331			break;
332		default:
333			break;
334		}
335	}
336
337	rai->ai_route = calloc(path_cnt, sizeof(*path_data));
338	if (rai->ai_route) {
339		memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data));
340		rai->ai_route_len = path_cnt * sizeof(*path_data);
341	}
342}
343
344static void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr)
345{
346	if (addr->sa_family == AF_INET) {
347		data->type = ACM_EP_INFO_ADDRESS_IP;
348		memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4);
349	} else {
350		data->type = ACM_EP_INFO_ADDRESS_IP6;
351		memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16);
352	}
353}
354
/*
 * Return 1 if (addr, len) is a present IPv4 or IPv6 address, else 0.
 * A NULL addr or a zero len counts as "not present".
 */
static int ucma_inet_addr(struct sockaddr *addr, socklen_t len)
{
	return len && addr && (addr->sa_family == AF_INET ||
			       addr->sa_family == AF_INET6);
}
360
/*
 * Return 1 if (addr, len) is a present AF_IB address, else 0.
 * A NULL addr or a zero len counts as "not present".
 */
static int ucma_ib_addr(struct sockaddr *addr, socklen_t len)
{
	return len && addr && (addr->sa_family == AF_IB);
}
365
366void ucma_ib_resolve(struct rdma_addrinfo **rai,
367		     const struct rdma_addrinfo *hints)
368{
369	struct acm_msg msg;
370	struct acm_ep_addr_data *data;
371	int ret;
372
373	ucma_ib_init();
374	if (sock < 0)
375		return;
376
377	memset(&msg, 0, sizeof msg);
378	msg.hdr.version = ACM_VERSION;
379	msg.hdr.opcode = ACM_OP_RESOLVE;
380	msg.hdr.length = ACM_MSG_HDR_LENGTH;
381
382	data = &msg.resolve_data[0];
383	if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
384		data->flags = ACM_EP_FLAG_SOURCE;
385		ucma_set_ep_addr(data, (*rai)->ai_src_addr);
386		data++;
387		msg.hdr.length += ACM_MSG_EP_LENGTH;
388	}
389
390	if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
391		data->flags = ACM_EP_FLAG_DEST;
392		if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE))
393			data->flags |= ACM_FLAGS_NODELAY;
394		ucma_set_ep_addr(data, (*rai)->ai_dst_addr);
395		data++;
396		msg.hdr.length += ACM_MSG_EP_LENGTH;
397	}
398
399	if (hints->ai_route_len ||
400	    ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) ||
401	    ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
402		struct ibv_path_record *path;
403
404		if (hints->ai_route_len == sizeof(struct ibv_path_record))
405			path = (struct ibv_path_record *) hints->ai_route;
406		else if (hints->ai_route_len == sizeof(struct ibv_path_data))
407			path = &((struct ibv_path_data *) hints->ai_route)->path;
408		else
409			path = NULL;
410
411		if (path)
412			memcpy(&data->info.path, path, sizeof(*path));
413
414		if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
415			memcpy(&data->info.path.sgid,
416			       &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16);
417		}
418		if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
419			memcpy(&data->info.path.dgid,
420			       &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16);
421		}
422		data->type = ACM_EP_INFO_PATH;
423		data++;
424		msg.hdr.length += ACM_MSG_EP_LENGTH;
425	}
426
427	pthread_mutex_lock(&acm_lock);
428	ret = send(sock, (char *) &msg, msg.hdr.length, 0);
429	if (ret != msg.hdr.length) {
430		pthread_mutex_unlock(&acm_lock);
431		return;
432	}
433
434	ret = recv(sock, (char *) &msg, sizeof msg, 0);
435	pthread_mutex_unlock(&acm_lock);
436	if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status)
437		return;
438
439	ucma_ib_save_resp(*rai, &msg);
440
441	if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len)
442		ucma_resolve_af_ib(rai);
443}
444