cma.c revision 331769
1/*
2 * Copyright (c) 2005-2014 Intel Corporation.  All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses.  You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 *     Redistribution and use in source and binary forms, with or
11 *     without modification, are permitted provided that the following
12 *     conditions are met:
13 *
14 *      - Redistributions of source code must retain the above
15 *        copyright notice, this list of conditions and the following
16 *        disclaimer.
17 *
18 *      - Redistributions in binary form must reproduce the above
19 *        copyright notice, this list of conditions and the following
20 *        disclaimer in the documentation and/or other materials
21 *        provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <config.h>
34
35#include <stdlib.h>
36#include <string.h>
37#include <glob.h>
38#include <stdio.h>
39#include <fcntl.h>
40#include <errno.h>
41#include <stdint.h>
42#include <poll.h>
43#include <unistd.h>
44#include <pthread.h>
45#include <infiniband/endian.h>
46#include <stddef.h>
47#include <netdb.h>
48#include <syslog.h>
49#include <limits.h>
50
51#include "cma.h"
52#include "indexer.h"
53#include <infiniband/driver.h>
54#include <infiniband/marshall.h>
55#include <rdma/rdma_cma.h>
56#include <rdma/rdma_cma_abi.h>
57#include <rdma/rdma_verbs.h>
58#include <infiniband/ib.h>
59
60#define CMA_INIT_CMD(req, req_size, op)		\
61do {						\
62	memset(req, 0, req_size);		\
63	(req)->cmd = UCMA_CMD_##op;		\
64	(req)->in  = req_size - sizeof(struct ucma_abi_cmd_hdr); \
65} while (0)
66
67#define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \
68do {						\
69	CMA_INIT_CMD(req, req_size, op);	\
70	(req)->out = resp_size;			\
71	(req)->response = (uintptr_t) (resp);	\
72} while (0)
73
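/*
 * Editor's note (sketch, not part of the original source): every public
 * entry point in this file talks to the kernel rdma_ucm module through the
 * same pattern - build a ucma_abi_* command with CMA_INIT_CMD() or
 * CMA_INIT_CMD_RESP() and write() it to the event channel's file
 * descriptor.  For a hypothetical request struct "cmd" the expansion is:
 *
 *	memset(&cmd, 0, sizeof cmd);
 *	cmd.cmd = UCMA_CMD_<op>;
 *	cmd.in  = sizeof cmd - sizeof(struct ucma_abi_cmd_hdr);
 *	cmd.out = sizeof resp;			// RESP variant only
 *	cmd.response = (uintptr_t) &resp;	// RESP variant only
 *
 *	ret = write(channel->fd, &cmd, sizeof cmd);
 *	if (ret != sizeof cmd)
 *		return (ret >= 0) ? ERR(ENODATA) : -1;
 */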
74struct cma_port {
75	uint8_t			link_layer;
76};
77
78struct cma_device {
79	struct ibv_context *verbs;
80	struct ibv_pd	   *pd;
81	struct ibv_xrcd    *xrcd;
82	struct cma_port    *port;
83	__be64		    guid;
84	int		    port_cnt;
85	int		    refcnt;
86	int		    max_qpsize;
87	uint8_t		    max_initiator_depth;
88	uint8_t		    max_responder_resources;
89};
90
91struct cma_id_private {
92	struct rdma_cm_id	id;
93	struct cma_device	*cma_dev;
94	void			*connect;
95	size_t			connect_len;
96	int			events_completed;
97	int			connect_error;
98	int			sync;
99	pthread_cond_t		cond;
100	pthread_mutex_t		mut;
101	uint32_t		handle;
102	struct cma_multicast	*mc_list;
103	struct ibv_qp_init_attr	*qp_init_attr;
104	uint8_t			initiator_depth;
105	uint8_t			responder_resources;
106};
107
108struct cma_multicast {
109	struct cma_multicast  *next;
110	struct cma_id_private *id_priv;
111	void		*context;
112	int		events_completed;
113	pthread_cond_t	cond;
114	uint32_t	handle;
115	union ibv_gid	mgid;
116	uint16_t	mlid;
117	struct sockaddr_storage addr;
118};
119
120struct cma_event {
121	struct rdma_cm_event	event;
122	uint8_t			private_data[RDMA_MAX_PRIVATE_DATA];
123	struct cma_id_private	*id_priv;
124	struct cma_multicast	*mc;
125};
126
127static struct cma_device *cma_dev_array;
128static int cma_dev_cnt;
129static int cma_init_cnt;
130static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
131static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
132int af_ib_support;
133static struct index_map ucma_idm;
134static fastlock_t idm_lock;
135
136static int check_abi_version(void)
137{
138	char value[8];
139
140	if ((ibv_read_sysfs_file(ibv_get_sysfs_path(),
141				 "class/misc/rdma_cm/abi_version",
142				 value, sizeof value) < 0) &&
143	    (ibv_read_sysfs_file(ibv_get_sysfs_path(),
144				 "class/infiniband_ucma/abi_version",
145				 value, sizeof value) < 0)) {
146		/*
147		 * Older versions of Linux do not have class/misc.  To support
148		 * backports, assume the most recent version of the ABI.  If
149		 * we're wrong, we'll simply fail later when calling the ABI.
150		 */
151		return 0;
152	}
153
154	abi_ver = strtol(value, NULL, 10);
155	if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION ||
156	    abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) {
157		return -1;
158	}
159	return 0;
160}
161
162/*
163 * This function is called while holding the mutex lock.
164 * cma_dev_cnt must be set before calling this function to
165 * ensure that the lock is not acquired recursively.
166 */
167static void ucma_set_af_ib_support(void)
168{
169	struct rdma_cm_id *id;
170	struct sockaddr_ib sib;
171	int ret;
172
173	ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB);
174	if (ret)
175		return;
176
177	memset(&sib, 0, sizeof sib);
178	sib.sib_family = AF_IB;
179	sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP);
180	sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK);
181	af_ib_support = 1;
182	ret = rdma_bind_addr(id, (struct sockaddr *) &sib);
183	af_ib_support = !ret;
184
185	rdma_destroy_id(id);
186}
187
188int ucma_init(void)
189{
190	struct ibv_device **dev_list = NULL;
191	int i, ret, dev_cnt;
192
193	/* Quick check without lock to see if we're already initialized */
194	if (cma_dev_cnt)
195		return 0;
196
197	pthread_mutex_lock(&mut);
198	if (cma_dev_cnt) {
199		pthread_mutex_unlock(&mut);
200		return 0;
201	}
202
203	fastlock_init(&idm_lock);
204	ret = check_abi_version();
205	if (ret)
206		goto err1;
207
208	dev_list = ibv_get_device_list(&dev_cnt);
209	if (!dev_list) {
210		ret = ERR(ENODEV);
211		goto err1;
212	}
213
214	if (!dev_cnt) {
215		ret = ERR(ENODEV);
216		goto err2;
217	}
218
219	cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array));
220	if (!cma_dev_array) {
221		ret = ERR(ENOMEM);
222		goto err2;
223	}
224
225	for (i = 0; dev_list[i]; i++)
226		cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]);
227
228	cma_dev_cnt = dev_cnt;
229	ucma_set_af_ib_support();
230	pthread_mutex_unlock(&mut);
231	ibv_free_device_list(dev_list);
232	return 0;
233
234err2:
235	ibv_free_device_list(dev_list);
236err1:
237	fastlock_destroy(&idm_lock);
238	pthread_mutex_unlock(&mut);
239	return ret;
240}
241
242static struct ibv_context *ucma_open_device(__be64 guid)
243{
244	struct ibv_device **dev_list;
245	struct ibv_context *verbs = NULL;
246	int i;
247
248	dev_list = ibv_get_device_list(NULL);
249	if (!dev_list) {
250		return NULL;
251	}
252
253	for (i = 0; dev_list[i]; i++) {
254		if (ibv_get_device_guid(dev_list[i]) == guid) {
255			verbs = ibv_open_device(dev_list[i]);
256			break;
257		}
258	}
259
260	ibv_free_device_list(dev_list);
261	return verbs;
262}
263
264static int ucma_init_device(struct cma_device *cma_dev)
265{
266	struct ibv_port_attr port_attr;
267	struct ibv_device_attr attr;
268	int i, ret;
269
270	if (cma_dev->verbs)
271		return 0;
272
273	cma_dev->verbs = ucma_open_device(cma_dev->guid);
274	if (!cma_dev->verbs)
275		return ERR(ENODEV);
276
277	ret = ibv_query_device(cma_dev->verbs, &attr);
278	if (ret) {
279		ret = ERR(ret);
280		goto err;
281	}
282
283	cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt);
284	if (!cma_dev->port) {
285		ret = ERR(ENOMEM);
286		goto err;
287	}
288
289	for (i = 1; i <= attr.phys_port_cnt; i++) {
290		if (ibv_query_port(cma_dev->verbs, i, &port_attr))
291			cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED;
292		else
293			cma_dev->port[i - 1].link_layer = port_attr.link_layer;
294	}
295
296	cma_dev->port_cnt = attr.phys_port_cnt;
297	cma_dev->max_qpsize = attr.max_qp_wr;
298	cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
299	cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
300	cma_init_cnt++;
301	return 0;
302
303err:
304	ibv_close_device(cma_dev->verbs);
305	cma_dev->verbs = NULL;
306	return ret;
307}
308
309static int ucma_init_all(void)
310{
311	int i, ret = 0;
312
313	if (!cma_dev_cnt) {
314		ret = ucma_init();
315		if (ret)
316			return ret;
317	}
318
319	if (cma_init_cnt == cma_dev_cnt)
320		return 0;
321
322	pthread_mutex_lock(&mut);
323	for (i = 0; i < cma_dev_cnt; i++) {
324		ret = ucma_init_device(&cma_dev_array[i]);
325		if (ret)
326			break;
327	}
328	pthread_mutex_unlock(&mut);
329	return ret;
330}
331
332struct ibv_context **rdma_get_devices(int *num_devices)
333{
334	struct ibv_context **devs = NULL;
335	int i;
336
337	if (ucma_init_all())
338		goto out;
339
340	devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1));
341	if (!devs)
342		goto out;
343
344	for (i = 0; i < cma_dev_cnt; i++)
345		devs[i] = cma_dev_array[i].verbs;
346	devs[i] = NULL;
347out:
348	if (num_devices)
349		*num_devices = devs ? cma_dev_cnt : 0;
350	return devs;
351}
352
353void rdma_free_devices(struct ibv_context **list)
354{
355	free(list);
356}
357
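/*
 * Editor's sketch (not part of the original source): typical use of
 * rdma_get_devices()/rdma_free_devices().  Variable names are illustrative.
 *
 *	struct ibv_context **list;
 *	int i, n;
 *
 *	list = rdma_get_devices(&n);
 *	if (!list)
 *		return -1;
 *	for (i = 0; i < n; i++)
 *		printf("%s\n", ibv_get_device_name(list[i]->device));
 *	rdma_free_devices(list);
 */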
358struct rdma_event_channel *rdma_create_event_channel(void)
359{
360	struct rdma_event_channel *channel;
361
362	if (ucma_init())
363		return NULL;
364
365	channel = malloc(sizeof(*channel));
366	if (!channel)
367		return NULL;
368
369	channel->fd = open("/dev/rdma_cm", O_RDWR | O_CLOEXEC);
370	if (channel->fd < 0) {
371		goto err;
372	}
373	return channel;
374err:
375	free(channel);
376	return NULL;
377}
378
379void rdma_destroy_event_channel(struct rdma_event_channel *channel)
380{
381	close(channel->fd);
382	free(channel);
383}
384
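/*
 * Editor's sketch (not part of the original source): an application doing
 * its own event loop creates an explicit channel and can poll channel->fd
 * for readability before calling rdma_get_cm_event().  Error handling is
 * omitted for brevity.
 *
 *	struct rdma_event_channel *ch = rdma_create_event_channel();
 *	struct rdma_cm_event *event;
 *	struct pollfd pfd = { .fd = ch->fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, 1000) > 0 && !rdma_get_cm_event(ch, &event)) {
 *		// ... dispatch on event->event ...
 *		rdma_ack_cm_event(event);
 *	}
 *	rdma_destroy_event_channel(ch);
 */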
385static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid)
386{
387	struct cma_device *cma_dev;
388	int i, ret;
389
390	for (i = 0; i < cma_dev_cnt; i++) {
391		cma_dev = &cma_dev_array[i];
392		if (cma_dev->guid == guid)
393			goto match;
394	}
395
396	return ERR(ENODEV);
397match:
398	pthread_mutex_lock(&mut);
399	if ((ret = ucma_init_device(cma_dev)))
400		goto out;
401
402	if (!cma_dev->refcnt++) {
403		cma_dev->pd = ibv_alloc_pd(cma_dev->verbs);
404		if (!cma_dev->pd) {
405			cma_dev->refcnt--;
406			ret = ERR(ENOMEM);
407			goto out;
408		}
409	}
410	id_priv->cma_dev = cma_dev;
411	id_priv->id.verbs = cma_dev->verbs;
412	id_priv->id.pd = cma_dev->pd;
413out:
414	pthread_mutex_unlock(&mut);
415	return ret;
416}
417
418static void ucma_put_device(struct cma_device *cma_dev)
419{
420	pthread_mutex_lock(&mut);
421	if (!--cma_dev->refcnt) {
422		ibv_dealloc_pd(cma_dev->pd);
423		if (cma_dev->xrcd)
424			ibv_close_xrcd(cma_dev->xrcd);
425	}
426	pthread_mutex_unlock(&mut);
427}
428
429static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev)
430{
431	struct ibv_xrcd_init_attr attr;
432
433	pthread_mutex_lock(&mut);
434	if (!cma_dev->xrcd) {
435		memset(&attr, 0, sizeof attr);
436		attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
437		attr.fd = -1;
438		attr.oflags = O_CREAT;
439		cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr);
440	}
441	pthread_mutex_unlock(&mut);
442	return cma_dev->xrcd;
443}
444
445static void ucma_insert_id(struct cma_id_private *id_priv)
446{
447	fastlock_acquire(&idm_lock);
448	idm_set(&ucma_idm, id_priv->handle, id_priv);
449	fastlock_release(&idm_lock);
450}
451
452static void ucma_remove_id(struct cma_id_private *id_priv)
453{
454	if (id_priv->handle <= IDX_MAX_INDEX)
455		idm_clear(&ucma_idm, id_priv->handle);
456}
457
458static struct cma_id_private *ucma_lookup_id(int handle)
459{
460	return idm_lookup(&ucma_idm, handle);
461}
462
463static void ucma_free_id(struct cma_id_private *id_priv)
464{
465	ucma_remove_id(id_priv);
466	if (id_priv->cma_dev)
467		ucma_put_device(id_priv->cma_dev);
468	pthread_cond_destroy(&id_priv->cond);
469	pthread_mutex_destroy(&id_priv->mut);
470	if (id_priv->id.route.path_rec)
471		free(id_priv->id.route.path_rec);
472
473	if (id_priv->sync)
474		rdma_destroy_event_channel(id_priv->id.channel);
475	if (id_priv->connect_len)
476		free(id_priv->connect);
477	free(id_priv);
478}
479
480static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel,
481					    void *context,
482					    enum rdma_port_space ps,
483					    enum ibv_qp_type qp_type)
484{
485	struct cma_id_private *id_priv;
486
487	id_priv = calloc(1, sizeof(*id_priv));
488	if (!id_priv)
489		return NULL;
490
491	id_priv->id.context = context;
492	id_priv->id.ps = ps;
493	id_priv->id.qp_type = qp_type;
494	id_priv->handle = 0xFFFFFFFF;
495
496	if (!channel) {
497		id_priv->id.channel = rdma_create_event_channel();
498		if (!id_priv->id.channel)
499			goto err;
500		id_priv->sync = 1;
501	} else {
502		id_priv->id.channel = channel;
503	}
504
505	pthread_mutex_init(&id_priv->mut, NULL);
506	if (pthread_cond_init(&id_priv->cond, NULL))
507		goto err;
508
509	return id_priv;
510
511err:	ucma_free_id(id_priv);
512	return NULL;
513}
514
515static int rdma_create_id2(struct rdma_event_channel *channel,
516			   struct rdma_cm_id **id, void *context,
517			   enum rdma_port_space ps, enum ibv_qp_type qp_type)
518{
519	struct ucma_abi_create_id_resp resp;
520	struct ucma_abi_create_id cmd;
521	struct cma_id_private *id_priv;
522	int ret;
523
524	ret = ucma_init();
525	if (ret)
526		return ret;
527
528	id_priv = ucma_alloc_id(channel, context, ps, qp_type);
529	if (!id_priv)
530		return ERR(ENOMEM);
531
532	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp);
533	cmd.uid = (uintptr_t) id_priv;
534	cmd.ps = ps;
535	cmd.qp_type = qp_type;
536
537	ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
538	if (ret != sizeof cmd)
539		goto err;
540
541	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
542
543	id_priv->handle = resp.id;
544	ucma_insert_id(id_priv);
545	*id = &id_priv->id;
546	return 0;
547
548err:	ucma_free_id(id_priv);
549	return ret;
550}
551
552int rdma_create_id(struct rdma_event_channel *channel,
553		   struct rdma_cm_id **id, void *context,
554		   enum rdma_port_space ps)
555{
556	enum ibv_qp_type qp_type;
557
558	qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ?
559		  IBV_QPT_UD : IBV_QPT_RC;
560	return rdma_create_id2(channel, id, context, ps, qp_type);
561}
562
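/*
 * Editor's sketch (not part of the original source): passing a NULL channel
 * to rdma_create_id() creates a synchronous id - ucma_alloc_id() allocates
 * a private event channel and each call below then waits for its own
 * completion event via ucma_complete().
 *
 *	struct rdma_cm_id *id;
 *
 *	if (rdma_create_id(NULL, &id, NULL, RDMA_PS_TCP))
 *		return -1;
 *	// ... resolve/connect using the synchronous calls below ...
 *	rdma_destroy_id(id);
 */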
563static int ucma_destroy_kern_id(int fd, uint32_t handle)
564{
565	struct ucma_abi_destroy_id_resp resp;
566	struct ucma_abi_destroy_id cmd;
567	int ret;
568
569	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp);
570	cmd.id = handle;
571
572	ret = write(fd, &cmd, sizeof cmd);
573	if (ret != sizeof cmd)
574		return (ret >= 0) ? ERR(ENODATA) : -1;
575
576	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
577
578	return resp.events_reported;
579}
580
581int rdma_destroy_id(struct rdma_cm_id *id)
582{
583	struct cma_id_private *id_priv;
584	int ret;
585
586	id_priv = container_of(id, struct cma_id_private, id);
587	ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle);
588	if (ret < 0)
589		return ret;
590
591	if (id_priv->id.event)
592		rdma_ack_cm_event(id_priv->id.event);
593
594	pthread_mutex_lock(&id_priv->mut);
595	while (id_priv->events_completed < ret)
596		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
597	pthread_mutex_unlock(&id_priv->mut);
598
599	ucma_free_id(id_priv);
600	return 0;
601}
602
603int ucma_addrlen(struct sockaddr *addr)
604{
605	if (!addr)
606		return 0;
607
608	switch (addr->sa_family) {
609	case PF_INET:
610		return sizeof(struct sockaddr_in);
611	case PF_INET6:
612		return sizeof(struct sockaddr_in6);
613	case PF_IB:
614		return af_ib_support ? sizeof(struct sockaddr_ib) : 0;
615	default:
616		return 0;
617	}
618}
619
620static int ucma_query_addr(struct rdma_cm_id *id)
621{
622	struct ucma_abi_query_addr_resp resp;
623	struct ucma_abi_query cmd;
624	struct cma_id_private *id_priv;
625	int ret;
626
627	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
628	id_priv = container_of(id, struct cma_id_private, id);
629	cmd.id = id_priv->handle;
630	cmd.option = UCMA_QUERY_ADDR;
631
632	ret = write(id->channel->fd, &cmd, sizeof cmd);
633	if (ret != sizeof cmd)
634		return (ret >= 0) ? ERR(ENODATA) : -1;
635
636	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
637
638	memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size);
639	memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size);
640
641	if (!id_priv->cma_dev && resp.node_guid) {
642		ret = ucma_get_device(id_priv, resp.node_guid);
643		if (ret)
644			return ret;
645		id->port_num = resp.port_num;
646		id->route.addr.addr.ibaddr.pkey = resp.pkey;
647	}
648
649	return 0;
650}
651
652static int ucma_query_gid(struct rdma_cm_id *id)
653{
654	struct ucma_abi_query_addr_resp resp;
655	struct ucma_abi_query cmd;
656	struct cma_id_private *id_priv;
657	struct sockaddr_ib *sib;
658	int ret;
659
660	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
661	id_priv = container_of(id, struct cma_id_private, id);
662	cmd.id = id_priv->handle;
663	cmd.option = UCMA_QUERY_GID;
664
665	ret = write(id->channel->fd, &cmd, sizeof cmd);
666	if (ret != sizeof cmd)
667		return (ret >= 0) ? ERR(ENODATA) : -1;
668
669	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
670
671	sib = (struct sockaddr_ib *) &resp.src_addr;
672	memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw,
673	       sizeof id->route.addr.addr.ibaddr.sgid);
674
675	sib = (struct sockaddr_ib *) &resp.dst_addr;
676	memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw,
677	       sizeof id->route.addr.addr.ibaddr.dgid);
678
679	return 0;
680}
681
682static void ucma_convert_path(struct ibv_path_data *path_data,
683			      struct ibv_sa_path_rec *sa_path)
684{
685	uint32_t fl_hop;
686
687	sa_path->dgid = path_data->path.dgid;
688	sa_path->sgid = path_data->path.sgid;
689	sa_path->dlid = path_data->path.dlid;
690	sa_path->slid = path_data->path.slid;
691	sa_path->raw_traffic = 0;
692
693	fl_hop = be32toh(path_data->path.flowlabel_hoplimit);
694	sa_path->flow_label = htobe32(fl_hop >> 8);
695	sa_path->hop_limit = (uint8_t) fl_hop;
696
697	sa_path->traffic_class = path_data->path.tclass;
698	sa_path->reversible = path_data->path.reversible_numpath >> 7;
699	sa_path->numb_path = 1;
700	sa_path->pkey = path_data->path.pkey;
701	sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF;
702	sa_path->mtu_selector = 2;	/* exactly */
703	sa_path->mtu = path_data->path.mtu & 0x1F;
704	sa_path->rate_selector = 2;
705	sa_path->rate = path_data->path.rate & 0x1F;
706	sa_path->packet_life_time_selector = 2;
707	sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F;
708
709	sa_path->preference = (uint8_t) path_data->flags;
710}
711
712static int ucma_query_path(struct rdma_cm_id *id)
713{
714	struct ucma_abi_query_path_resp *resp;
715	struct ucma_abi_query cmd;
716	struct cma_id_private *id_priv;
717	int ret, i, size;
718
719	size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6;
720	resp = alloca(size);
721	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size);
722	id_priv = container_of(id, struct cma_id_private, id);
723	cmd.id = id_priv->handle;
724	cmd.option = UCMA_QUERY_PATH;
725
726	ret = write(id->channel->fd, &cmd, sizeof cmd);
727	if (ret != sizeof cmd)
728		return (ret >= 0) ? ERR(ENODATA) : -1;
729
730	VALGRIND_MAKE_MEM_DEFINED(resp, size);
731
732	if (resp->num_paths) {
733		id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
734					    resp->num_paths);
735		if (!id->route.path_rec)
736			return ERR(ENOMEM);
737
738		id->route.num_paths = resp->num_paths;
739		for (i = 0; i < resp->num_paths; i++)
740			ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]);
741	}
742
743	return 0;
744}
745
746static int ucma_query_route(struct rdma_cm_id *id)
747{
748	struct ucma_abi_query_route_resp resp;
749	struct ucma_abi_query cmd;
750	struct cma_id_private *id_priv;
751	int ret, i;
752
753	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp);
754	id_priv = container_of(id, struct cma_id_private, id);
755	cmd.id = id_priv->handle;
756
757	ret = write(id->channel->fd, &cmd, sizeof cmd);
758	if (ret != sizeof cmd)
759		return (ret >= 0) ? ERR(ENODATA) : -1;
760
761	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
762
763	if (resp.num_paths) {
764		id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
765					    resp.num_paths);
766		if (!id->route.path_rec)
767			return ERR(ENOMEM);
768
769		id->route.num_paths = resp.num_paths;
770		for (i = 0; i < resp.num_paths; i++)
771			ibv_copy_path_rec_from_kern(&id->route.path_rec[i],
772						    &resp.ib_route[i]);
773	}
774
775	memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid,
776	       sizeof id->route.addr.addr.ibaddr.sgid);
777	memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid,
778	       sizeof id->route.addr.addr.ibaddr.dgid);
779	id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey;
780	memcpy(&id->route.addr.src_addr, &resp.src_addr,
781	       sizeof resp.src_addr);
782	memcpy(&id->route.addr.dst_addr, &resp.dst_addr,
783	       sizeof resp.dst_addr);
784
785	if (!id_priv->cma_dev && resp.node_guid) {
786		ret = ucma_get_device(id_priv, resp.node_guid);
787		if (ret)
788			return ret;
789		id_priv->id.port_num = resp.port_num;
790	}
791
792	return 0;
793}
794
795static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr,
796			   socklen_t addrlen)
797{
798	struct ucma_abi_bind cmd;
799	struct cma_id_private *id_priv;
800	int ret;
801
802	CMA_INIT_CMD(&cmd, sizeof cmd, BIND);
803	id_priv = container_of(id, struct cma_id_private, id);
804	cmd.id = id_priv->handle;
805	cmd.addr_size = addrlen;
806	memcpy(&cmd.addr, addr, addrlen);
807
808	ret = write(id->channel->fd, &cmd, sizeof cmd);
809	if (ret != sizeof cmd)
810		return (ret >= 0) ? ERR(ENODATA) : -1;
811
812	ret = ucma_query_addr(id);
813	if (!ret)
814		ret = ucma_query_gid(id);
815	return ret;
816}
817
818int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
819{
820	struct ucma_abi_bind_ip cmd;
821	struct cma_id_private *id_priv;
822	int ret, addrlen;
823
824	addrlen = ucma_addrlen(addr);
825	if (!addrlen)
826		return ERR(EINVAL);
827
828	if (af_ib_support)
829		return rdma_bind_addr2(id, addr, addrlen);
830
831	CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP);
832	id_priv = container_of(id, struct cma_id_private, id);
833	cmd.id = id_priv->handle;
834	memcpy(&cmd.addr, addr, addrlen);
835
836	ret = write(id->channel->fd, &cmd, sizeof cmd);
837	if (ret != sizeof cmd)
838		return (ret >= 0) ? ERR(ENODATA) : -1;
839
840	return ucma_query_route(id);
841}
842
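/*
 * Editor's sketch (not part of the original source): binding a server id to
 * a local IPv4 address before listening.  The port number is arbitrary;
 * <netinet/in.h> provides sockaddr_in, htons() and INADDR_ANY.
 *
 *	struct sockaddr_in sin;
 *
 *	memset(&sin, 0, sizeof sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(7471);
 *	sin.sin_addr.s_addr = htonl(INADDR_ANY);
 *	if (rdma_bind_addr(listen_id, (struct sockaddr *) &sin))
 *		return -1;
 */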
843int ucma_complete(struct rdma_cm_id *id)
844{
845	struct cma_id_private *id_priv;
846	int ret;
847
848	id_priv = container_of(id, struct cma_id_private, id);
849	if (!id_priv->sync)
850		return 0;
851
852	if (id_priv->id.event) {
853		rdma_ack_cm_event(id_priv->id.event);
854		id_priv->id.event = NULL;
855	}
856
857	ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event);
858	if (ret)
859		return ret;
860
861	if (id_priv->id.event->status) {
862		if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED)
863			ret = ERR(ECONNREFUSED);
864		else if (id_priv->id.event->status < 0)
865			ret = ERR(-id_priv->id.event->status);
866		else
867			ret = ERR(id_priv->id.event->status);
868	}
869	return ret;
870}
871
872static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr,
873			      socklen_t src_len, struct sockaddr *dst_addr,
874			      socklen_t dst_len, int timeout_ms)
875{
876	struct ucma_abi_resolve_addr cmd;
877	struct cma_id_private *id_priv;
878	int ret;
879
880	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR);
881	id_priv = container_of(id, struct cma_id_private, id);
882	cmd.id = id_priv->handle;
883	if ((cmd.src_size = src_len))
884		memcpy(&cmd.src_addr, src_addr, src_len);
885	memcpy(&cmd.dst_addr, dst_addr, dst_len);
886	cmd.dst_size = dst_len;
887	cmd.timeout_ms = timeout_ms;
888
889	ret = write(id->channel->fd, &cmd, sizeof cmd);
890	if (ret != sizeof cmd)
891		return (ret >= 0) ? ERR(ENODATA) : -1;
892
893	memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
894	return ucma_complete(id);
895}
896
897int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
898		      struct sockaddr *dst_addr, int timeout_ms)
899{
900	struct ucma_abi_resolve_ip cmd;
901	struct cma_id_private *id_priv;
902	int ret, dst_len, src_len;
903
904	dst_len = ucma_addrlen(dst_addr);
905	if (!dst_len)
906		return ERR(EINVAL);
907
908	src_len = ucma_addrlen(src_addr);
909	if (src_addr && !src_len)
910		return ERR(EINVAL);
911
912	if (af_ib_support)
913		return rdma_resolve_addr2(id, src_addr, src_len, dst_addr,
914					  dst_len, timeout_ms);
915
916	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP);
917	id_priv = container_of(id, struct cma_id_private, id);
918	cmd.id = id_priv->handle;
919	if (src_addr)
920		memcpy(&cmd.src_addr, src_addr, src_len);
921	memcpy(&cmd.dst_addr, dst_addr, dst_len);
922	cmd.timeout_ms = timeout_ms;
923
924	ret = write(id->channel->fd, &cmd, sizeof cmd);
925	if (ret != sizeof cmd)
926		return (ret >= 0) ? ERR(ENODATA) : -1;
927
928	memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
929	return ucma_complete(id);
930}
931
932static int ucma_set_ib_route(struct rdma_cm_id *id)
933{
934	struct rdma_addrinfo hint, *rai;
935	int ret;
936
937	memset(&hint, 0, sizeof hint);
938	hint.ai_flags = RAI_ROUTEONLY;
939	hint.ai_family = id->route.addr.src_addr.sa_family;
940	hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr);
941	hint.ai_src_addr = &id->route.addr.src_addr;
942	hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr);
943	hint.ai_dst_addr = &id->route.addr.dst_addr;
944
945	ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai);
946	if (ret)
947		return ret;
948
949	if (rai->ai_route_len)
950		ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
951				      rai->ai_route, rai->ai_route_len);
952	else
953		ret = -1;
954
955	rdma_freeaddrinfo(rai);
956	return ret;
957}
958
959int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
960{
961	struct ucma_abi_resolve_route cmd;
962	struct cma_id_private *id_priv;
963	int ret;
964
965	id_priv = container_of(id, struct cma_id_private, id);
966	if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) {
967		ret = ucma_set_ib_route(id);
968		if (!ret)
969			goto out;
970	}
971
972	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE);
973	cmd.id = id_priv->handle;
974	cmd.timeout_ms = timeout_ms;
975
976	ret = write(id->channel->fd, &cmd, sizeof cmd);
977	if (ret != sizeof cmd)
978		return (ret >= 0) ? ERR(ENODATA) : -1;
979
980out:
981	return ucma_complete(id);
982}
983
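/*
 * Editor's sketch (not part of the original source): the usual client-side
 * sequence on a synchronous id is address resolution followed by route
 * resolution.  "dst" is assumed to be a sockaddr prepared by the caller
 * (e.g. from getaddrinfo()); the 2000 ms timeouts are arbitrary.
 *
 *	if (rdma_resolve_addr(id, NULL, dst, 2000))
 *		return -1;
 *	if (rdma_resolve_route(id, 2000))
 *		return -1;
 *	// id->verbs and id->pd now reference the matched device, so a QP
 *	// can be created with rdma_create_qp() below.
 */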
984static int ucma_is_ud_qp(enum ibv_qp_type qp_type)
985{
986	return (qp_type == IBV_QPT_UD);
987}
988
989static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr,
990			     int *qp_attr_mask)
991{
992	struct ucma_abi_init_qp_attr cmd;
993	struct ibv_kern_qp_attr resp;
994	struct cma_id_private *id_priv;
995	int ret;
996
997	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp);
998	id_priv = container_of(id, struct cma_id_private, id);
999	cmd.id = id_priv->handle;
1000	cmd.qp_state = qp_attr->qp_state;
1001
1002	ret = write(id->channel->fd, &cmd, sizeof cmd);
1003	if (ret != sizeof cmd)
1004		return (ret >= 0) ? ERR(ENODATA) : -1;
1005
1006	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1007
1008	ibv_copy_qp_attr_from_kern(qp_attr, &resp);
1009	*qp_attr_mask = resp.qp_attr_mask;
1010	return 0;
1011}
1012
1013static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res)
1014{
1015	struct cma_id_private *id_priv;
1016	struct ibv_qp_attr qp_attr;
1017	int qp_attr_mask, ret;
1018	uint8_t link_layer;
1019
1020	if (!id->qp)
1021		return ERR(EINVAL);
1022
1023	/* Need to update QP attributes from default values. */
1024	qp_attr.qp_state = IBV_QPS_INIT;
1025	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1026	if (ret)
1027		return ret;
1028
1029	ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
1030	if (ret)
1031		return ERR(ret);
1032
1033	qp_attr.qp_state = IBV_QPS_RTR;
1034	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1035	if (ret)
1036		return ret;
1037
1038	/*
1039	 * Workaround for rdma_ucm kernel bug:
1040	 * mask off qp_attr_mask bits 21-24 which are used for RoCE
1041	 */
1042	id_priv = container_of(id, struct cma_id_private, id);
1043	link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer;
1044
1045	if (link_layer == IBV_LINK_LAYER_INFINIBAND)
1046		qp_attr_mask &= UINT_MAX ^ 0xe00000;
1047
1048	if (resp_res != RDMA_MAX_RESP_RES)
1049		qp_attr.max_dest_rd_atomic = resp_res;
1050	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1051}
1052
1053static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth)
1054{
1055	struct ibv_qp_attr qp_attr;
1056	int qp_attr_mask, ret;
1057
1058	qp_attr.qp_state = IBV_QPS_RTS;
1059	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1060	if (ret)
1061		return ret;
1062
1063	if (init_depth != RDMA_MAX_INIT_DEPTH)
1064		qp_attr.max_rd_atomic = init_depth;
1065	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1066}
1067
1068static int ucma_modify_qp_sqd(struct rdma_cm_id *id)
1069{
1070	struct ibv_qp_attr qp_attr;
1071
1072	if (!id->qp)
1073		return 0;
1074
1075	qp_attr.qp_state = IBV_QPS_SQD;
1076	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1077}
1078
1079static int ucma_modify_qp_err(struct rdma_cm_id *id)
1080{
1081	struct ibv_qp_attr qp_attr;
1082
1083	if (!id->qp)
1084		return 0;
1085
1086	qp_attr.qp_state = IBV_QPS_ERR;
1087	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1088}
1089
1090static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num,
1091			  __be16 pkey, uint16_t *pkey_index)
1092{
1093	int ret, i;
1094	__be16 chk_pkey;
1095
1096	for (i = 0, ret = 0; !ret; i++) {
1097		ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey);
1098		if (!ret && pkey == chk_pkey) {
1099			*pkey_index = (uint16_t) i;
1100			return 0;
1101		}
1102	}
1103	return ERR(EINVAL);
1104}
1105
1106static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1107{
1108	struct ibv_qp_attr qp_attr;
1109	int ret;
1110
1111	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1112			     id_priv->id.route.addr.addr.ibaddr.pkey,
1113			     &qp_attr.pkey_index);
1114	if (ret)
1115		return ret;
1116
1117	qp_attr.port_num = id_priv->id.port_num;
1118	qp_attr.qp_state = IBV_QPS_INIT;
1119	qp_attr.qp_access_flags = 0;
1120
1121	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
1122					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1123	return rdma_seterrno(ret);
1124}
1125
1126static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1127{
1128	struct ibv_qp_attr qp_attr;
1129	int qp_attr_mask, ret;
1130
1131	if (abi_ver == 3)
1132		return ucma_init_conn_qp3(id_priv, qp);
1133
1134	qp_attr.qp_state = IBV_QPS_INIT;
1135	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1136	if (ret)
1137		return ret;
1138
1139	return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask));
1140}
1141
1142static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1143{
1144	struct ibv_qp_attr qp_attr;
1145	int ret;
1146
1147	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1148			     id_priv->id.route.addr.addr.ibaddr.pkey,
1149			     &qp_attr.pkey_index);
1150	if (ret)
1151		return ret;
1152
1153	qp_attr.port_num = id_priv->id.port_num;
1154	qp_attr.qp_state = IBV_QPS_INIT;
1155	qp_attr.qkey = RDMA_UDP_QKEY;
1156
1157	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY |
1158					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1159	if (ret)
1160		return ERR(ret);
1161
1162	qp_attr.qp_state = IBV_QPS_RTR;
1163	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1164	if (ret)
1165		return ERR(ret);
1166
1167	qp_attr.qp_state = IBV_QPS_RTS;
1168	qp_attr.sq_psn = 0;
1169	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1170	return rdma_seterrno(ret);
1171}
1172
1173static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1174{
1175	struct ibv_qp_attr qp_attr;
1176	int qp_attr_mask, ret;
1177
1178	if (abi_ver == 3)
1179		return ucma_init_ud_qp3(id_priv, qp);
1180
1181	qp_attr.qp_state = IBV_QPS_INIT;
1182	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1183	if (ret)
1184		return ret;
1185
1186	ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask);
1187	if (ret)
1188		return ERR(ret);
1189
1190	qp_attr.qp_state = IBV_QPS_RTR;
1191	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1192	if (ret)
1193		return ERR(ret);
1194
1195	qp_attr.qp_state = IBV_QPS_RTS;
1196	qp_attr.sq_psn = 0;
1197	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1198	return rdma_seterrno(ret);
1199}
1200
1201static void ucma_destroy_cqs(struct rdma_cm_id *id)
1202{
1203	if (id->qp_type == IBV_QPT_XRC_RECV && id->srq)
1204		return;
1205
1206	if (id->recv_cq) {
1207		ibv_destroy_cq(id->recv_cq);
1208		if (id->send_cq && (id->send_cq != id->recv_cq)) {
1209			ibv_destroy_cq(id->send_cq);
1210			id->send_cq = NULL;
1211		}
1212		id->recv_cq = NULL;
1213	}
1214
1215	if (id->recv_cq_channel) {
1216		ibv_destroy_comp_channel(id->recv_cq_channel);
1217		if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) {
1218			ibv_destroy_comp_channel(id->send_cq_channel);
1219			id->send_cq_channel = NULL;
1220		}
1221		id->recv_cq_channel = NULL;
1222	}
1223}
1224
1225static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size)
1226{
1227	if (recv_size) {
1228		id->recv_cq_channel = ibv_create_comp_channel(id->verbs);
1229		if (!id->recv_cq_channel)
1230			goto err;
1231
1232		id->recv_cq = ibv_create_cq(id->verbs, recv_size,
1233					    id, id->recv_cq_channel, 0);
1234		if (!id->recv_cq)
1235			goto err;
1236	}
1237
1238	if (send_size) {
1239		id->send_cq_channel = ibv_create_comp_channel(id->verbs);
1240		if (!id->send_cq_channel)
1241			goto err;
1242
1243		id->send_cq = ibv_create_cq(id->verbs, send_size,
1244					    id, id->send_cq_channel, 0);
1245		if (!id->send_cq)
1246			goto err;
1247	}
1248
1249	return 0;
1250err:
1251	ucma_destroy_cqs(id);
1252	return ERR(ENOMEM);
1253}
1254
1255int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr)
1256{
1257	struct cma_id_private *id_priv;
1258	struct ibv_srq *srq;
1259	int ret;
1260
1261	id_priv = container_of(id, struct cma_id_private, id);
1262	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE))
1263		return ERR(EINVAL);
1264
1265	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) {
1266		attr->pd = id->pd;
1267		attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD;
1268	}
1269
1270	if (attr->srq_type == IBV_SRQT_XRC) {
1271		if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) {
1272			attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1273			if (!attr->xrcd)
1274				return -1;
1275		}
1276		if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) {
1277			ret = ucma_create_cqs(id, 0, attr->attr.max_wr);
1278			if (ret)
1279				return ret;
1280			attr->cq = id->recv_cq;
1281		}
1282		attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ;
1283	}
1284
1285	srq = ibv_create_srq_ex(id->verbs, attr);
1286	if (!srq) {
1287		ret = -1;
1288		goto err;
1289	}
1290
1291	if (!id->pd)
1292		id->pd = attr->pd;
1293	id->srq = srq;
1294	return 0;
1295err:
1296	ucma_destroy_cqs(id);
1297	return ret;
1298}
1299
1300int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd,
1301		    struct ibv_srq_init_attr *attr)
1302{
1303	struct ibv_srq_init_attr_ex attr_ex;
1304	int ret;
1305
1306	memcpy(&attr_ex, attr, sizeof(*attr));
1307	attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD;
1308	if (id->qp_type == IBV_QPT_XRC_RECV) {
1309		attr_ex.srq_type = IBV_SRQT_XRC;
1310	} else {
1311		attr_ex.srq_type = IBV_SRQT_BASIC;
1312	}
1313	attr_ex.pd = pd;
1314	ret = rdma_create_srq_ex(id, &attr_ex);
1315	memcpy(attr, &attr_ex, sizeof(*attr));
1316	return ret;
1317}
1318
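/*
 * Editor's sketch (not part of the original source): attaching a shared
 * receive queue to an id before creating its QP.  Passing a NULL pd makes
 * rdma_create_srq_ex() fall back to the id's protection domain; the sizes
 * are arbitrary.
 *
 *	struct ibv_srq_init_attr srq_attr;
 *
 *	memset(&srq_attr, 0, sizeof srq_attr);
 *	srq_attr.attr.max_wr = 64;
 *	srq_attr.attr.max_sge = 1;
 *	if (rdma_create_srq(id, NULL, &srq_attr))
 *		return -1;
 */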
1319void rdma_destroy_srq(struct rdma_cm_id *id)
1320{
1321	ibv_destroy_srq(id->srq);
1322	id->srq = NULL;
1323	ucma_destroy_cqs(id);
1324}
1325
1326int rdma_create_qp_ex(struct rdma_cm_id *id,
1327		      struct ibv_qp_init_attr_ex *attr)
1328{
1329	struct cma_id_private *id_priv;
1330	struct ibv_qp *qp;
1331	int ret;
1332
1333	if (id->qp)
1334		return ERR(EINVAL);
1335
1336	id_priv = container_of(id, struct cma_id_private, id);
1337	if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) {
1338		attr->comp_mask |= IBV_QP_INIT_ATTR_PD;
1339		attr->pd = id->pd;
1340	} else if (id->verbs != attr->pd->context)
1341		return ERR(EINVAL);
1342
1343	if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) ||
1344	    (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq))
1345		return ERR(EINVAL);
1346
1347	if (id->qp_type == IBV_QPT_XRC_RECV) {
1348		if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) {
1349			attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1350			if (!attr->xrcd)
1351				return -1;
1352			attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD;
1353		}
1354	}
1355
1356	ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr,
1357				  attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr);
1358	if (ret)
1359		return ret;
1360
1361	if (!attr->send_cq)
1362		attr->send_cq = id->send_cq;
1363	if (!attr->recv_cq)
1364		attr->recv_cq = id->recv_cq;
1365	if (id->srq && !attr->srq)
1366		attr->srq = id->srq;
1367	qp = ibv_create_qp_ex(id->verbs, attr);
1368	if (!qp) {
1369		ret = ERR(ENOMEM);
1370		goto err1;
1371	}
1372
1373	if (ucma_is_ud_qp(id->qp_type))
1374		ret = ucma_init_ud_qp(id_priv, qp);
1375	else
1376		ret = ucma_init_conn_qp(id_priv, qp);
1377	if (ret)
1378		goto err2;
1379
1380	id->pd = qp->pd;
1381	id->qp = qp;
1382	return 0;
1383err2:
1384	ibv_destroy_qp(qp);
1385err1:
1386	ucma_destroy_cqs(id);
1387	return ret;
1388}
1389
1390int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd,
1391		   struct ibv_qp_init_attr *qp_init_attr)
1392{
1393	struct ibv_qp_init_attr_ex attr_ex;
1394	int ret;
1395
1396	memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr));
1397	attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
1398	attr_ex.pd = pd ? pd : id->pd;
1399	ret = rdma_create_qp_ex(id, &attr_ex);
1400	memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr));
1401	return ret;
1402}
1403
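/*
 * Editor's sketch (not part of the original source): creating an RC QP on a
 * resolved id.  The capacities are arbitrary; with a NULL pd the id's
 * default protection domain is used, and send/recv CQs are created
 * automatically by ucma_create_cqs() when none are supplied.
 *
 *	struct ibv_qp_init_attr attr;
 *
 *	memset(&attr, 0, sizeof attr);
 *	attr.cap.max_send_wr = attr.cap.max_recv_wr = 16;
 *	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
 *	attr.qp_type = IBV_QPT_RC;
 *	if (rdma_create_qp(id, NULL, &attr))
 *		return -1;
 */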
1404void rdma_destroy_qp(struct rdma_cm_id *id)
1405{
1406	ibv_destroy_qp(id->qp);
1407	id->qp = NULL;
1408	ucma_destroy_cqs(id);
1409}
1410
1411static int ucma_valid_param(struct cma_id_private *id_priv,
1412			    struct rdma_conn_param *param)
1413{
1414	if (id_priv->id.ps != RDMA_PS_TCP)
1415		return 0;
1416
1417	if (!id_priv->id.qp && !param)
1418		goto err;
1419
1420	if (!param)
1421		return 0;
1422
1423	if ((param->responder_resources != RDMA_MAX_RESP_RES) &&
1424	    (param->responder_resources > id_priv->cma_dev->max_responder_resources))
1425		goto err;
1426
1427	if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) &&
1428	    (param->initiator_depth > id_priv->cma_dev->max_initiator_depth))
1429		goto err;
1430
1431	return 0;
1432err:
1433	return ERR(EINVAL);
1434}
1435
1436static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv,
1437					 struct ucma_abi_conn_param *dst,
1438					 struct rdma_conn_param *src,
1439					 uint32_t qp_num, uint8_t srq)
1440{
1441	dst->qp_num = qp_num;
1442	dst->srq = srq;
1443	dst->responder_resources = id_priv->responder_resources;
1444	dst->initiator_depth = id_priv->initiator_depth;
1445	dst->valid = 1;
1446
1447	if (id_priv->connect_len) {
1448		memcpy(dst->private_data, id_priv->connect, id_priv->connect_len);
1449		dst->private_data_len = id_priv->connect_len;
1450	}
1451
1452	if (src) {
1453		dst->flow_control = src->flow_control;
1454		dst->retry_count = src->retry_count;
1455		dst->rnr_retry_count = src->rnr_retry_count;
1456
1457		if (src->private_data && src->private_data_len) {
1458			memcpy(dst->private_data + dst->private_data_len,
1459			       src->private_data, src->private_data_len);
1460			dst->private_data_len += src->private_data_len;
1461		}
1462	} else {
1463		dst->retry_count = 7;
1464		dst->rnr_retry_count = 7;
1465	}
1466}
1467
1468int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1469{
1470	struct ucma_abi_connect cmd;
1471	struct cma_id_private *id_priv;
1472	int ret;
1473
1474	id_priv = container_of(id, struct cma_id_private, id);
1475	ret = ucma_valid_param(id_priv, conn_param);
1476	if (ret)
1477		return ret;
1478
1479	if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH)
1480		id_priv->initiator_depth = conn_param->initiator_depth;
1481	else
1482		id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth;
1483	if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES)
1484		id_priv->responder_resources = conn_param->responder_resources;
1485	else
1486		id_priv->responder_resources = id_priv->cma_dev->max_responder_resources;
1487
1488	CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT);
1489	cmd.id = id_priv->handle;
1490	if (id->qp) {
1491		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1492					     conn_param, id->qp->qp_num,
1493					     (id->qp->srq != NULL));
1494	} else if (conn_param) {
1495		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1496					     conn_param, conn_param->qp_num,
1497					     conn_param->srq);
1498	} else {
1499		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1500					     conn_param, 0, 0);
1501	}
1502
1503	ret = write(id->channel->fd, &cmd, sizeof cmd);
1504	if (ret != sizeof cmd)
1505		return (ret >= 0) ? ERR(ENODATA) : -1;
1506
1507	if (id_priv->connect_len) {
1508		free(id_priv->connect);
1509		id_priv->connect_len = 0;
1510	}
1511
1512	return ucma_complete(id);
1513}
1514
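/*
 * Editor's sketch (not part of the original source): initiating a
 * connection on a synchronous id that already has a QP.  The parameter
 * values are illustrative; leaving initiator_depth/responder_resources at
 * RDMA_MAX_INIT_DEPTH/RDMA_MAX_RESP_RES makes the device maximums apply
 * instead.
 *
 *	struct rdma_conn_param param;
 *
 *	memset(&param, 0, sizeof param);
 *	param.responder_resources = 1;
 *	param.initiator_depth = 1;
 *	param.retry_count = 7;
 *	param.rnr_retry_count = 7;
 *	if (rdma_connect(id, &param))
 *		return -1;
 */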
1515int rdma_listen(struct rdma_cm_id *id, int backlog)
1516{
1517	struct ucma_abi_listen cmd;
1518	struct cma_id_private *id_priv;
1519	int ret;
1520
1521	CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN);
1522	id_priv = container_of(id, struct cma_id_private, id);
1523	cmd.id = id_priv->handle;
1524	cmd.backlog = backlog;
1525
1526	ret = write(id->channel->fd, &cmd, sizeof cmd);
1527	if (ret != sizeof cmd)
1528		return (ret >= 0) ? ERR(ENODATA) : -1;
1529
1530	if (af_ib_support)
1531		return ucma_query_addr(id);
1532	else
1533		return ucma_query_route(id);
1534}
1535
1536int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id)
1537{
1538	struct cma_id_private *id_priv;
1539	struct rdma_cm_event *event;
1540	int ret;
1541
1542	id_priv = container_of(listen, struct cma_id_private, id);
1543	if (!id_priv->sync)
1544		return ERR(EINVAL);
1545
1546	if (listen->event) {
1547		rdma_ack_cm_event(listen->event);
1548		listen->event = NULL;
1549	}
1550
1551	ret = rdma_get_cm_event(listen->channel, &event);
1552	if (ret)
1553		return ret;
1554
1555	if (event->status) {
1556		ret = ERR(event->status);
1557		goto err;
1558	}
1559
1560	if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
1561		ret = ERR(EINVAL);
1562		goto err;
1563	}
1564
1565	if (id_priv->qp_init_attr) {
1566		struct ibv_qp_init_attr attr;
1567
1568		attr = *id_priv->qp_init_attr;
1569		ret = rdma_create_qp(event->id, listen->pd, &attr);
1570		if (ret)
1571			goto err;
1572	}
1573
1574	*id = event->id;
1575	(*id)->event = event;
1576	return 0;
1577
1578err:
1579	listen->event = event;
1580	return ret;
1581}
1582
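/*
 * Editor's sketch (not part of the original source): a synchronous server
 * retrieves incoming connection requests with rdma_get_request().
 * listen_id must have been created with a NULL channel and bound with
 * rdma_bind_addr(); the backlog of 8 is arbitrary.
 *
 *	struct rdma_cm_id *conn_id;
 *
 *	if (rdma_listen(listen_id, 8))
 *		return -1;
 *	if (rdma_get_request(listen_id, &conn_id))
 *		return -1;
 *	// create a QP on conn_id, then accept it (see rdma_accept() below)
 */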
1583int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1584{
1585	struct ucma_abi_accept cmd;
1586	struct cma_id_private *id_priv;
1587	int ret;
1588
1589	id_priv = container_of(id, struct cma_id_private, id);
1590	ret = ucma_valid_param(id_priv, conn_param);
1591	if (ret)
1592		return ret;
1593
1594	if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) {
1595		id_priv->initiator_depth = min(id_priv->initiator_depth,
1596					       id_priv->cma_dev->max_initiator_depth);
1597	} else {
1598		id_priv->initiator_depth = conn_param->initiator_depth;
1599	}
1600	if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) {
1601		id_priv->responder_resources = min(id_priv->responder_resources,
1602						   id_priv->cma_dev->max_responder_resources);
1603	} else {
1604		id_priv->responder_resources = conn_param->responder_resources;
1605	}
1606
1607	if (!ucma_is_ud_qp(id->qp_type)) {
1608		ret = ucma_modify_qp_rtr(id, id_priv->responder_resources);
1609		if (ret)
1610			return ret;
1611
1612		ret = ucma_modify_qp_rts(id, id_priv->initiator_depth);
1613		if (ret)
1614			return ret;
1615	}
1616
1617	CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1618	cmd.id = id_priv->handle;
1619	cmd.uid = (uintptr_t) id_priv;
1620	if (id->qp)
1621		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1622					     conn_param, id->qp->qp_num,
1623					     (id->qp->srq != NULL));
1624	else
1625		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1626					     conn_param, conn_param->qp_num,
1627					     conn_param->srq);
1628
1629	ret = write(id->channel->fd, &cmd, sizeof cmd);
1630	if (ret != sizeof cmd) {
1631		ucma_modify_qp_err(id);
1632		return (ret >= 0) ? ERR(ENODATA) : -1;
1633	}
1634
1635	if (ucma_is_ud_qp(id->qp_type))
1636		return 0;
1637
1638	return ucma_complete(id);
1639}
1640
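/*
 * Editor's sketch (not part of the original source): accepting the request
 * obtained above.  With a QP already attached, a NULL conn_param is
 * allowed; responder_resources/initiator_depth then default to the values
 * carried in the connect request, capped by the device limits.
 *
 *	struct ibv_qp_init_attr attr;
 *
 *	memset(&attr, 0, sizeof attr);
 *	attr.cap.max_send_wr = attr.cap.max_recv_wr = 16;
 *	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
 *	attr.qp_type = IBV_QPT_RC;
 *	if (rdma_create_qp(conn_id, NULL, &attr))
 *		return -1;
 *	if (rdma_accept(conn_id, NULL))
 *		return -1;
 */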
1641int rdma_reject(struct rdma_cm_id *id, const void *private_data,
1642		uint8_t private_data_len)
1643{
1644	struct ucma_abi_reject cmd;
1645	struct cma_id_private *id_priv;
1646	int ret;
1647
1648	CMA_INIT_CMD(&cmd, sizeof cmd, REJECT);
1649
1650	id_priv = container_of(id, struct cma_id_private, id);
1651	cmd.id = id_priv->handle;
1652	if (private_data && private_data_len) {
1653		memcpy(cmd.private_data, private_data, private_data_len);
1654		cmd.private_data_len = private_data_len;
1655	}
1656
1657	ret = write(id->channel->fd, &cmd, sizeof cmd);
1658	if (ret != sizeof cmd)
1659		return (ret >= 0) ? ERR(ENODATA) : -1;
1660
1661	return 0;
1662}
1663
1664int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event)
1665{
1666	struct ucma_abi_notify cmd;
1667	struct cma_id_private *id_priv;
1668	int ret;
1669
1670	CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY);
1671
1672	id_priv = container_of(id, struct cma_id_private, id);
1673	cmd.id = id_priv->handle;
1674	cmd.event = event;
1675	ret = write(id->channel->fd, &cmd, sizeof cmd);
1676	if (ret != sizeof cmd)
1677		return (ret >= 0) ? ERR(ENODATA) : -1;
1678
1679	return 0;
1680}
1681
1682int ucma_shutdown(struct rdma_cm_id *id)
1683{
1684	switch (id->verbs->device->transport_type) {
1685	case IBV_TRANSPORT_IB:
1686		return ucma_modify_qp_err(id);
1687	case IBV_TRANSPORT_IWARP:
1688		return ucma_modify_qp_sqd(id);
1689	default:
1690		return ERR(EINVAL);
1691	}
1692}
1693
1694int rdma_disconnect(struct rdma_cm_id *id)
1695{
1696	struct ucma_abi_disconnect cmd;
1697	struct cma_id_private *id_priv;
1698	int ret;
1699
1700	ret = ucma_shutdown(id);
1701	if (ret)
1702		return ret;
1703
1704	CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT);
1705	id_priv = container_of(id, struct cma_id_private, id);
1706	cmd.id = id_priv->handle;
1707
1708	ret = write(id->channel->fd, &cmd, sizeof cmd);
1709	if (ret != sizeof cmd)
1710		return (ret >= 0) ? ERR(ENODATA) : -1;
1711
1712	return ucma_complete(id);
1713}
1714
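/*
 * Editor's sketch (not part of the original source): orderly teardown of a
 * connected id.
 *
 *	rdma_disconnect(id);
 *	rdma_destroy_qp(id);
 *	rdma_destroy_id(id);
 */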
1715static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr,
1716				socklen_t addrlen, void *context)
1717{
1718	struct ucma_abi_create_id_resp resp;
1719	struct cma_id_private *id_priv;
1720	struct cma_multicast *mc, **pos;
1721	int ret;
1722
1723	id_priv = container_of(id, struct cma_id_private, id);
1724	mc = calloc(1, sizeof(*mc));
1725	if (!mc)
1726		return ERR(ENOMEM);
1727
1728	mc->context = context;
1729	mc->id_priv = id_priv;
1730	memcpy(&mc->addr, addr, addrlen);
1731	if (pthread_cond_init(&mc->cond, NULL)) {
1732		ret = -1;
1733		goto err1;
1734	}
1735
1736	pthread_mutex_lock(&id_priv->mut);
1737	mc->next = id_priv->mc_list;
1738	id_priv->mc_list = mc;
1739	pthread_mutex_unlock(&id_priv->mut);
1740
1741	if (af_ib_support) {
1742		struct ucma_abi_join_mcast cmd;
1743
1744		CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp);
1745		cmd.id = id_priv->handle;
1746		memcpy(&cmd.addr, addr, addrlen);
1747		cmd.addr_size = addrlen;
1748		cmd.uid = (uintptr_t) mc;
1749		cmd.reserved = 0;
1750
1751		ret = write(id->channel->fd, &cmd, sizeof cmd);
1752		if (ret != sizeof cmd) {
1753			ret = (ret >= 0) ? ERR(ENODATA) : -1;
1754			goto err2;
1755		}
1756	} else {
1757		struct ucma_abi_join_ip_mcast cmd;
1758
1759		CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp);
1760		cmd.id = id_priv->handle;
1761		memcpy(&cmd.addr, addr, addrlen);
1762		cmd.uid = (uintptr_t) mc;
1763
1764		ret = write(id->channel->fd, &cmd, sizeof cmd);
1765		if (ret != sizeof cmd) {
1766			ret = (ret >= 0) ? ERR(ENODATA) : -1;
1767			goto err2;
1768		}
1769	}
1770
1771	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1772
1773	mc->handle = resp.id;
1774	return ucma_complete(id);
1775
1776err2:
1777	pthread_mutex_lock(&id_priv->mut);
1778	for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next)
1779		;
1780	*pos = mc->next;
1781	pthread_mutex_unlock(&id_priv->mut);
1782err1:
1783	free(mc);
1784	return ret;
1785}
1786
1787int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
1788			void *context)
1789{
1790	int addrlen;
1791
1792	addrlen = ucma_addrlen(addr);
1793	if (!addrlen)
1794		return ERR(EINVAL);
1795
1796	return rdma_join_multicast2(id, addr, addrlen, context);
1797}
1798
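/*
 * Editor's sketch (not part of the original source): joining a multicast
 * group on a UD id (e.g. RDMA_PS_UDP).  "mcast_addr" is assumed to hold the
 * group address, for instance resolved via rdma_getaddrinfo(); "ctx" is an
 * arbitrary user pointer echoed back in the join event.
 *
 *	if (rdma_join_multicast(id, mcast_addr, ctx))
 *		return -1;
 *	// ... exchange datagrams using the AH details from the join event ...
 *	rdma_leave_multicast(id, mcast_addr);
 */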
1799int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
1800{
1801	struct ucma_abi_destroy_id cmd;
1802	struct ucma_abi_destroy_id_resp resp;
1803	struct cma_id_private *id_priv;
1804	struct cma_multicast *mc, **pos;
1805	int ret, addrlen;
1806
1807	addrlen = ucma_addrlen(addr);
1808	if (!addrlen)
1809		return ERR(EINVAL);
1810
1811	id_priv = container_of(id, struct cma_id_private, id);
1812	pthread_mutex_lock(&id_priv->mut);
1813	for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next)
1814		if (!memcmp(&(*pos)->addr, addr, addrlen))
1815			break;
1816
1817	mc = *pos;
1818	if (*pos)
1819		*pos = mc->next;
1820	pthread_mutex_unlock(&id_priv->mut);
1821	if (!mc)
1822		return ERR(EADDRNOTAVAIL);
1823
1824	if (id->qp)
1825		ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid);
1826
1827	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp);
1828	cmd.id = mc->handle;
1829
1830	ret = write(id->channel->fd, &cmd, sizeof cmd);
1831	if (ret != sizeof cmd) {
1832		ret = (ret >= 0) ? ERR(ENODATA) : -1;
1833		goto free;
1834	}
1835
1836	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1837
1838	pthread_mutex_lock(&id_priv->mut);
1839	while (mc->events_completed < resp.events_reported)
1840		pthread_cond_wait(&mc->cond, &id_priv->mut);
1841	pthread_mutex_unlock(&id_priv->mut);
1842
1843	ret = 0;
1844free:
1845	free(mc);
1846	return ret;
1847}
1848
1849static void ucma_complete_event(struct cma_id_private *id_priv)
1850{
1851	pthread_mutex_lock(&id_priv->mut);
1852	id_priv->events_completed++;
1853	pthread_cond_signal(&id_priv->cond);
1854	pthread_mutex_unlock(&id_priv->mut);
1855}
1856
1857static void ucma_complete_mc_event(struct cma_multicast *mc)
1858{
1859	pthread_mutex_lock(&mc->id_priv->mut);
1860	mc->events_completed++;
1861	pthread_cond_signal(&mc->cond);
1862	mc->id_priv->events_completed++;
1863	pthread_cond_signal(&mc->id_priv->cond);
1864	pthread_mutex_unlock(&mc->id_priv->mut);
1865}
1866
1867int rdma_ack_cm_event(struct rdma_cm_event *event)
1868{
1869	struct cma_event *evt;
1870
1871	if (!event)
1872		return ERR(EINVAL);
1873
1874	evt = container_of(event, struct cma_event, event);
1875
1876	if (evt->mc)
1877		ucma_complete_mc_event(evt->mc);
1878	else
1879		ucma_complete_event(evt->id_priv);
1880	free(evt);
1881	return 0;
1882}
1883
1884static void ucma_process_addr_resolved(struct cma_event *evt)
1885{
1886	if (af_ib_support) {
1887		evt->event.status = ucma_query_addr(&evt->id_priv->id);
1888		if (!evt->event.status &&
1889		    evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB)
1890			evt->event.status = ucma_query_gid(&evt->id_priv->id);
1891	} else {
1892		evt->event.status = ucma_query_route(&evt->id_priv->id);
1893	}
1894
1895	if (evt->event.status)
1896		evt->event.event = RDMA_CM_EVENT_ADDR_ERROR;
1897}
1898
1899static void ucma_process_route_resolved(struct cma_event *evt)
1900{
1901	if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB)
1902		return;
1903
1904	if (af_ib_support)
1905		evt->event.status = ucma_query_path(&evt->id_priv->id);
1906	else
1907		evt->event.status = ucma_query_route(&evt->id_priv->id);
1908
1909	if (evt->event.status)
1910		evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
1911}
1912
1913static int ucma_query_req_info(struct rdma_cm_id *id)
1914{
1915	int ret;
1916
1917	if (!af_ib_support)
1918		return ucma_query_route(id);
1919
1920	ret = ucma_query_addr(id);
1921	if (ret)
1922		return ret;
1923
1924	ret = ucma_query_gid(id);
1925	if (ret)
1926		return ret;
1927
1928	ret = ucma_query_path(id);
1929	if (ret)
1930		return ret;
1931
1932	return 0;
1933}
1934
1935static int ucma_process_conn_req(struct cma_event *evt,
1936				 uint32_t handle)
1937{
1938	struct cma_id_private *id_priv;
1939	int ret;
1940
1941	id_priv = ucma_alloc_id(evt->id_priv->id.channel,
1942				evt->id_priv->id.context, evt->id_priv->id.ps,
1943				evt->id_priv->id.qp_type);
1944	if (!id_priv) {
1945		ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle);
1946		ret = ERR(ENOMEM);
1947		goto err1;
1948	}
1949
1950	evt->event.listen_id = &evt->id_priv->id;
1951	evt->event.id = &id_priv->id;
1952	id_priv->handle = handle;
1953	ucma_insert_id(id_priv);
1954	id_priv->initiator_depth = evt->event.param.conn.initiator_depth;
1955	id_priv->responder_resources = evt->event.param.conn.responder_resources;
1956
1957	if (evt->id_priv->sync) {
1958		ret = rdma_migrate_id(&id_priv->id, NULL);
1959		if (ret)
1960			goto err2;
1961	}
1962
1963	ret = ucma_query_req_info(&id_priv->id);
1964	if (ret)
1965		goto err2;
1966
1967	return 0;
1968
1969err2:
1970	rdma_destroy_id(&id_priv->id);
1971err1:
1972	ucma_complete_event(evt->id_priv);
1973	return ret;
1974}
1975
1976static int ucma_process_conn_resp(struct cma_id_private *id_priv)
1977{
1978	struct ucma_abi_accept cmd;
1979	int ret;
1980
1981	ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES);
1982	if (ret)
1983		goto err;
1984
1985	ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH);
1986	if (ret)
1987		goto err;
1988
1989	CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1990	cmd.id = id_priv->handle;
1991
1992	ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
1993	if (ret != sizeof cmd) {
1994		ret = (ret >= 0) ? ERR(ENODATA) : -1;
1995		goto err;
1996	}
1997
1998	return 0;
1999err:
2000	ucma_modify_qp_err(&id_priv->id);
2001	return ret;
2002}
2003
2004static int ucma_process_join(struct cma_event *evt)
2005{
2006	evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid;
2007	evt->mc->mlid = evt->event.param.ud.ah_attr.dlid;
2008
2009	if (!evt->id_priv->id.qp)
2010		return 0;
2011
2012	return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp,
2013					      &evt->mc->mgid, evt->mc->mlid));
2014}
2015
2016static void ucma_copy_conn_event(struct cma_event *event,
2017				 struct ucma_abi_conn_param *src)
2018{
2019	struct rdma_conn_param *dst = &event->event.param.conn;
2020
2021	dst->private_data_len = src->private_data_len;
2022	if (src->private_data_len) {
2023		dst->private_data = &event->private_data;
2024		memcpy(&event->private_data, src->private_data,
2025		       src->private_data_len);
2026	}
2027
2028	dst->responder_resources = src->responder_resources;
2029	dst->initiator_depth = src->initiator_depth;
2030	dst->flow_control = src->flow_control;
2031	dst->retry_count = src->retry_count;
2032	dst->rnr_retry_count = src->rnr_retry_count;
2033	dst->srq = src->srq;
2034	dst->qp_num = src->qp_num;
2035}
2036
2037static void ucma_copy_ud_event(struct cma_event *event,
2038			       struct ucma_abi_ud_param *src)
2039{
2040	struct rdma_ud_param *dst = &event->event.param.ud;
2041
2042	dst->private_data_len = src->private_data_len;
2043	if (src->private_data_len) {
2044		dst->private_data = &event->private_data;
2045		memcpy(&event->private_data, src->private_data,
2046		       src->private_data_len);
2047	}
2048
2049	ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr);
2050	dst->qp_num = src->qp_num;
2051	dst->qkey = src->qkey;
2052}
2053
2054int rdma_get_cm_event(struct rdma_event_channel *channel,
2055		      struct rdma_cm_event **event)
2056{
2057	struct ucma_abi_event_resp resp;
2058	struct ucma_abi_get_event cmd;
2059	struct cma_event *evt;
2060	int ret;
2061
2062	ret = ucma_init();
2063	if (ret)
2064		return ret;
2065
2066	if (!event)
2067		return ERR(EINVAL);
2068
2069	evt = malloc(sizeof(*evt));
2070	if (!evt)
2071		return ERR(ENOMEM);
2072
2073retry:
2074	memset(evt, 0, sizeof(*evt));
2075	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp);
2076	ret = write(channel->fd, &cmd, sizeof cmd);
2077	if (ret != sizeof cmd) {
2078		free(evt);
2079		return (ret >= 0) ? ERR(ENODATA) : -1;
2080	}
2081
2082	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
2083
2084	evt->event.event = resp.event;
2085	/*
2086	 * We should have a non-zero uid, except for connection requests.
2087	 * However, a bug in older kernels can report a uid of 0.  Work around
2088	 * this by looking up the cma_id based on the kernel's id when the uid
2089	 * is 0 and we are processing a connection established event.  In all
2090	 * other cases, if the uid is 0, discard the event, as the kernel
2091	 * should have done.
2092	 */
2093	if (resp.uid) {
2094		evt->id_priv = (void *) (uintptr_t) resp.uid;
2095	} else {
2096		evt->id_priv = ucma_lookup_id(resp.id);
2097		if (!evt->id_priv) {
2098			syslog(LOG_WARNING, PFX "Warning: discarding unmatched "
2099				"event - rdma_destroy_id may hang.\n");
2100			goto retry;
2101		}
2102		if (resp.event != RDMA_CM_EVENT_ESTABLISHED) {
2103			ucma_complete_event(evt->id_priv);
2104			goto retry;
2105		}
2106	}
2107	evt->event.id = &evt->id_priv->id;
2108	evt->event.status = resp.status;
2109
2110	switch (resp.event) {
2111	case RDMA_CM_EVENT_ADDR_RESOLVED:
2112		ucma_process_addr_resolved(evt);
2113		break;
2114	case RDMA_CM_EVENT_ROUTE_RESOLVED:
2115		ucma_process_route_resolved(evt);
2116		break;
2117	case RDMA_CM_EVENT_CONNECT_REQUEST:
2118		evt->id_priv = (void *) (uintptr_t) resp.uid;
2119		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
2120			ucma_copy_ud_event(evt, &resp.param.ud);
2121		else
2122			ucma_copy_conn_event(evt, &resp.param.conn);
2123
2124		ret = ucma_process_conn_req(evt, resp.id);
2125		if (ret)
2126			goto retry;
2127		break;
2128	case RDMA_CM_EVENT_CONNECT_RESPONSE:
2129		ucma_copy_conn_event(evt, &resp.param.conn);
2130		evt->event.status = ucma_process_conn_resp(evt->id_priv);
2131		if (!evt->event.status)
2132			evt->event.event = RDMA_CM_EVENT_ESTABLISHED;
2133		else {
2134			evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
2135			evt->id_priv->connect_error = 1;
2136		}
2137		break;
2138	case RDMA_CM_EVENT_ESTABLISHED:
2139		if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) {
2140			ucma_copy_ud_event(evt, &resp.param.ud);
2141			break;
2142		}
2143
2144		ucma_copy_conn_event(evt, &resp.param.conn);
2145		break;
2146	case RDMA_CM_EVENT_REJECTED:
2147		if (evt->id_priv->connect_error) {
2148			ucma_complete_event(evt->id_priv);
2149			goto retry;
2150		}
2151		ucma_copy_conn_event(evt, &resp.param.conn);
2152		ucma_modify_qp_err(evt->event.id);
2153		break;
2154	case RDMA_CM_EVENT_DISCONNECTED:
2155		if (evt->id_priv->connect_error) {
2156			ucma_complete_event(evt->id_priv);
2157			goto retry;
2158		}
2159		ucma_copy_conn_event(evt, &resp.param.conn);
2160		break;
2161	case RDMA_CM_EVENT_MULTICAST_JOIN:
2162		evt->mc = (void *) (uintptr_t) resp.uid;
2163		evt->id_priv = evt->mc->id_priv;
2164		evt->event.id = &evt->id_priv->id;
2165		ucma_copy_ud_event(evt, &resp.param.ud);
2166		evt->event.param.ud.private_data = evt->mc->context;
2167		evt->event.status = ucma_process_join(evt);
2168		if (evt->event.status)
2169			evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
2170		break;
2171	case RDMA_CM_EVENT_MULTICAST_ERROR:
2172		evt->mc = (void *) (uintptr_t) resp.uid;
2173		evt->id_priv = evt->mc->id_priv;
2174		evt->event.id = &evt->id_priv->id;
2175		evt->event.param.ud.private_data = evt->mc->context;
2176		break;
2177	default:
2178		evt->id_priv = (void *) (uintptr_t) resp.uid;
2179		evt->event.id = &evt->id_priv->id;
2180		evt->event.status = resp.status;
2181		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
2182			ucma_copy_ud_event(evt, &resp.param.ud);
2183		else
2184			ucma_copy_conn_event(evt, &resp.param.conn);
2185		break;
2186	}
2187
2188	*event = &evt->event;
2189	return 0;
2190}
2191
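/*
 * Return a printable name for a CM event type; unknown values map to
 * "UNKNOWN EVENT".
 */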
2192const char *rdma_event_str(enum rdma_cm_event_type event)
2193{
2194	switch (event) {
2195	case RDMA_CM_EVENT_ADDR_RESOLVED:
2196		return "RDMA_CM_EVENT_ADDR_RESOLVED";
2197	case RDMA_CM_EVENT_ADDR_ERROR:
2198		return "RDMA_CM_EVENT_ADDR_ERROR";
2199	case RDMA_CM_EVENT_ROUTE_RESOLVED:
2200		return "RDMA_CM_EVENT_ROUTE_RESOLVED";
2201	case RDMA_CM_EVENT_ROUTE_ERROR:
2202		return "RDMA_CM_EVENT_ROUTE_ERROR";
2203	case RDMA_CM_EVENT_CONNECT_REQUEST:
2204		return "RDMA_CM_EVENT_CONNECT_REQUEST";
2205	case RDMA_CM_EVENT_CONNECT_RESPONSE:
2206		return "RDMA_CM_EVENT_CONNECT_RESPONSE";
2207	case RDMA_CM_EVENT_CONNECT_ERROR:
2208		return "RDMA_CM_EVENT_CONNECT_ERROR";
2209	case RDMA_CM_EVENT_UNREACHABLE:
2210		return "RDMA_CM_EVENT_UNREACHABLE";
2211	case RDMA_CM_EVENT_REJECTED:
2212		return "RDMA_CM_EVENT_REJECTED";
2213	case RDMA_CM_EVENT_ESTABLISHED:
2214		return "RDMA_CM_EVENT_ESTABLISHED";
2215	case RDMA_CM_EVENT_DISCONNECTED:
2216		return "RDMA_CM_EVENT_DISCONNECTED";
2217	case RDMA_CM_EVENT_DEVICE_REMOVAL:
2218		return "RDMA_CM_EVENT_DEVICE_REMOVAL";
2219	case RDMA_CM_EVENT_MULTICAST_JOIN:
2220		return "RDMA_CM_EVENT_MULTICAST_JOIN";
2221	case RDMA_CM_EVENT_MULTICAST_ERROR:
2222		return "RDMA_CM_EVENT_MULTICAST_ERROR";
2223	case RDMA_CM_EVENT_ADDR_CHANGE:
2224		return "RDMA_CM_EVENT_ADDR_CHANGE";
2225	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
2226		return "RDMA_CM_EVENT_TIMEWAIT_EXIT";
2227	default:
2228		return "UNKNOWN EVENT";
2229	}
2230}
2231
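/*
 * Option values are passed straight through to the kernel.  An illustrative
 * sketch (not part of this file), setting the type of service on an id
 * before connecting:
 *
 *	uint8_t tos = 0x10;
 *
 *	if (rdma_set_option(id, RDMA_OPTION_ID, RDMA_OPTION_ID_TOS,
 *			    &tos, sizeof tos))
 *		perror("rdma_set_option");
 */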
2232int rdma_set_option(struct rdma_cm_id *id, int level, int optname,
2233		    void *optval, size_t optlen)
2234{
2235	struct ucma_abi_set_option cmd;
2236	struct cma_id_private *id_priv;
2237	int ret;
2238
2239	CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION);
2240	id_priv = container_of(id, struct cma_id_private, id);
2241	cmd.id = id_priv->handle;
2242	cmd.optval = (uintptr_t) optval;
2243	cmd.level = level;
2244	cmd.optname = optname;
2245	cmd.optlen = optlen;
2246
2247	ret = write(id->channel->fd, &cmd, sizeof cmd);
2248	if (ret != sizeof cmd)
2249		return (ret >= 0) ? ERR(ENODATA) : -1;
2250
2251	return 0;
2252}
2253
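/*
 * Move an id to a different event channel.  Passing a NULL channel is the
 * internal path used for synchronous ids: a private channel is created and
 * the id is switched to synchronous operation.  A minimal sketch of the
 * typical asynchronous use (assuming an already created id):
 *
 *	struct rdma_event_channel *new_ch = rdma_create_event_channel();
 *
 *	if (new_ch && rdma_migrate_id(id, new_ch) == 0) {
 *		// events for this id are now reported on new_ch
 *	}
 */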
2254int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
2255{
2256	struct ucma_abi_migrate_resp resp;
2257	struct ucma_abi_migrate_id cmd;
2258	struct cma_id_private *id_priv;
2259	int ret, sync;
2260
2261	id_priv = container_of(id, struct cma_id_private, id);
2262	if (id_priv->sync && !channel)
2263		return ERR(EINVAL);
2264
2265	if ((sync = (channel == NULL))) {
2266		channel = rdma_create_event_channel();
2267		if (!channel)
2268			return -1;
2269	}
2270
2271	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp);
2272	cmd.id = id_priv->handle;
2273	cmd.fd = id->channel->fd;
2274
2275	ret = write(channel->fd, &cmd, sizeof cmd);
2276	if (ret != sizeof cmd) {
2277		if (sync)
2278			rdma_destroy_event_channel(channel);
2279		return (ret >= 0) ? ERR(ENODATA) : -1;
2280	}
2281
2282	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
2283
2284	if (id_priv->sync) {
2285		if (id->event) {
2286			rdma_ack_cm_event(id->event);
2287			id->event = NULL;
2288		}
2289		rdma_destroy_event_channel(id->channel);
2290	}
2291
2292	/*
2293	 * To support migrating a channel while events may still be outstanding
2294	 * on the current channel, block here until every event reported for
2295	 * this id has been acknowledged.  This prevents the user from
2296	 * processing events for this id on the old channel after this call
2297	 * returns.
2298	 */
2299	pthread_mutex_lock(&id_priv->mut);
2300	id_priv->sync = sync;
2301	id->channel = channel;
2302	while (id_priv->events_completed < resp.events_reported)
2303		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
2304	pthread_mutex_unlock(&id_priv->mut);
2305
2306	return 0;
2307}
2308
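/*
 * Set up the passive (listening) side of rdma_create_ep(): bind to the
 * requested source address and stash the caller's pd and qp_init_attr so
 * they can be applied to ids created for incoming connection requests.
 */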
2309static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res,
2310			   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2311{
2312	struct cma_id_private *id_priv;
2313	int ret;
2314
2315	if (af_ib_support)
2316		ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len);
2317	else
2318		ret = rdma_bind_addr(id, res->ai_src_addr);
2319	if (ret)
2320		return ret;
2321
2322	id_priv = container_of(id, struct cma_id_private, id);
2323	if (pd)
2324		id->pd = pd;
2325
2326	if (qp_init_attr) {
2327		id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr));
2328		if (!id_priv->qp_init_attr)
2329			return ERR(ENOMEM);
2330
2331		*id_priv->qp_init_attr = *qp_init_attr;
2332		id_priv->qp_init_attr->qp_type = res->ai_qp_type;
2333	}
2334
2335	return 0;
2336}
2337
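/*
 * Usage sketch (illustrative only, not part of this file): rdma_create_ep()
 * is normally paired with rdma_getaddrinfo().  A passive endpoint on an
 * arbitrary example port:
 *
 *	struct rdma_addrinfo hints, *res;
 *	struct rdma_cm_id *listen_id;
 *
 *	memset(&hints, 0, sizeof hints);
 *	hints.ai_flags = RAI_PASSIVE;
 *	hints.ai_port_space = RDMA_PS_TCP;
 *	if (rdma_getaddrinfo(NULL, "7471", &hints, &res))
 *		return;
 *	if (rdma_create_ep(&listen_id, res, NULL, NULL) == 0)
 *		rdma_listen(listen_id, 0);
 *	rdma_freeaddrinfo(res);
 */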
2338int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res,
2339		   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2340{
2341	struct rdma_cm_id *cm_id;
2342	struct cma_id_private *id_priv;
2343	int ret;
2344
2345	ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type);
2346	if (ret)
2347		return ret;
2348
2349	if (res->ai_flags & RAI_PASSIVE) {
2350		ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr);
2351		if (ret)
2352			goto err;
2353		goto out;
2354	}
2355
2356	if (af_ib_support)
2357		ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len,
2358					 res->ai_dst_addr, res->ai_dst_len, 2000);
2359	else
2360		ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000);
2361	if (ret)
2362		goto err;
2363
2364	if (res->ai_route_len) {
2365		ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
2366				      res->ai_route, res->ai_route_len);
2367		if (!ret)
2368			ret = ucma_complete(cm_id);
2369	} else {
2370		ret = rdma_resolve_route(cm_id, 2000);
2371	}
2372	if (ret)
2373		goto err;
2374
2375	if (qp_init_attr) {
2376		qp_init_attr->qp_type = res->ai_qp_type;
2377		ret = rdma_create_qp(cm_id, pd, qp_init_attr);
2378		if (ret)
2379			goto err;
2380	}
2381
2382	if (res->ai_connect_len) {
2383		id_priv = container_of(cm_id, struct cma_id_private, id);
2384		id_priv->connect = malloc(res->ai_connect_len);
2385		if (!id_priv->connect) {
2386			ret = ERR(ENOMEM);
2387			goto err;
2388		}
2389		memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len);
2390		id_priv->connect_len = res->ai_connect_len;
2391	}
2392
2393out:
2394	*id = cm_id;
2395	return 0;
2396
2397err:
2398	rdma_destroy_ep(cm_id);
2399	return ret;
2400}
2401
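/*
 * Tear down everything rdma_create_ep() may have set up: the QP, the SRQ,
 * any saved qp_init_attr, and finally the id itself.
 */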
2402void rdma_destroy_ep(struct rdma_cm_id *id)
2403{
2404	struct cma_id_private *id_priv;
2405
2406	if (id->qp)
2407		rdma_destroy_qp(id);
2408
2409	if (id->srq)
2410		rdma_destroy_srq(id);
2411
2412	id_priv = container_of(id, struct cma_id_private, id);
2413	if (id_priv->qp_init_attr)
2414		free(id_priv->qp_init_attr);
2415
2416	rdma_destroy_id(id);
2417}
2418
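/*
 * Return the largest supportable QP size: the limit of the device the id is
 * bound to, or, if the id has no device yet, the smallest limit across all
 * known devices so the value is safe on any of them.
 */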
2419int ucma_max_qpsize(struct rdma_cm_id *id)
2420{
2421	struct cma_id_private *id_priv;
2422	int i, max_size = 0;
2423
2424	id_priv = container_of(id, struct cma_id_private, id);
2425	if (id && id_priv->cma_dev) {
2426		max_size = id_priv->cma_dev->max_qpsize;
2427	} else {
2428		ucma_init_all();
2429		for (i = 0; i < cma_dev_cnt; i++) {
2430			if (!max_size || max_size > cma_dev_array[i].max_qpsize)
2431				max_size = cma_dev_array[i].max_qpsize;
2432		}
2433	}
2434	return max_size;
2435}
2436
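/*
 * Extract the port number, in network byte order, from a bound address.
 * For AF_IB the port is carried in the low 16 bits of the service id.
 * Callers typically convert to host order, e.g.:
 *
 *	uint16_t port = be16toh(rdma_get_src_port(id));
 */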
2437__be16 ucma_get_port(struct sockaddr *addr)
2438{
2439	switch (addr->sa_family) {
2440	case AF_INET:
2441		return ((struct sockaddr_in *) addr)->sin_port;
2442	case AF_INET6:
2443		return ((struct sockaddr_in6 *) addr)->sin6_port;
2444	case AF_IB:
2445		return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid));
2446	default:
2447		return 0;
2448	}
2449}
2450
2451__be16 rdma_get_src_port(struct rdma_cm_id *id)
2452{
2453	return ucma_get_port(&id->route.addr.src_addr);
2454}
2455
2456__be16 rdma_get_dst_port(struct rdma_cm_id *id)
2457{
2458	return ucma_get_port(&id->route.addr.dst_addr);
2459}
2460
2461