verbs.c revision 331769
1/*
2 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
3 * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#define _GNU_SOURCE
35#include <config.h>
36
37#include <infiniband/endian.h>
38#include <stdio.h>
39#include <unistd.h>
40#include <stdlib.h>
41#include <errno.h>
42#include <string.h>
43#include <dirent.h>
44#include <netinet/in.h>
45#include <netinet/ip.h>
46#include <sys/socket.h>
47
48#include "ibverbs.h"
49#ifndef NRESOLVE_NEIGH
50#include <net/if.h>
51#include <net/if_arp.h>
52#include "neigh.h"
53#endif
54
/* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse
   with these prototypes. Symbol versioning requires the goofy names; the
   prototypes must match the versions in verbs.h.
 */
59int __ibv_query_device(struct ibv_context *context,
60		       struct ibv_device_attr *device_attr);
61int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
62		     struct ibv_port_attr *port_attr);
63int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index,
64		    union ibv_gid *gid);
65int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index,
66		     __be16 *pkey);
67struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context);
68int __ibv_dealloc_pd(struct ibv_pd *pd);
69struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
70			    int access);
71int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr,
72		   size_t length, int access);
73int __ibv_dereg_mr(struct ibv_mr *mr);
74struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe,
75			       void *cq_context,
76			       struct ibv_comp_channel *channel,
77			       int comp_vector);
78int __ibv_resize_cq(struct ibv_cq *cq, int cqe);
79int __ibv_destroy_cq(struct ibv_cq *cq);
80int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq,
81		       void **cq_context);
82void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents);
83struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
84				 struct ibv_srq_init_attr *srq_init_attr);
85int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr,
86		     int srq_attr_mask);
87int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr);
88int __ibv_destroy_srq(struct ibv_srq *srq);
89struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
90			       struct ibv_qp_init_attr *qp_init_attr);
91int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
92		   struct ibv_qp_init_attr *init_attr);
93int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
94int __ibv_destroy_qp(struct ibv_qp *qp);
95struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
96int __ibv_destroy_ah(struct ibv_ah *ah);
97int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
98		       uint16_t lid);
99int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
100		       uint16_t lid);
101
102int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate)
103{
104	switch (rate) {
105	case IBV_RATE_2_5_GBPS: return  1;
106	case IBV_RATE_5_GBPS:   return  2;
107	case IBV_RATE_10_GBPS:  return  4;
108	case IBV_RATE_20_GBPS:  return  8;
109	case IBV_RATE_30_GBPS:  return 12;
110	case IBV_RATE_40_GBPS:  return 16;
111	case IBV_RATE_60_GBPS:  return 24;
112	case IBV_RATE_80_GBPS:  return 32;
113	case IBV_RATE_120_GBPS: return 48;
114	default:           return -1;
115	}
116}
117
118enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult)
119{
120	switch (mult) {
121	case 1:  return IBV_RATE_2_5_GBPS;
122	case 2:  return IBV_RATE_5_GBPS;
123	case 4:  return IBV_RATE_10_GBPS;
124	case 8:  return IBV_RATE_20_GBPS;
125	case 12: return IBV_RATE_30_GBPS;
126	case 16: return IBV_RATE_40_GBPS;
127	case 24: return IBV_RATE_60_GBPS;
128	case 32: return IBV_RATE_80_GBPS;
129	case 48: return IBV_RATE_120_GBPS;
130	default: return IBV_RATE_MAX;
131	}
132}
133
134int  __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate)
135{
136	switch (rate) {
137	case IBV_RATE_2_5_GBPS: return 2500;
138	case IBV_RATE_5_GBPS:   return 5000;
139	case IBV_RATE_10_GBPS:  return 10000;
140	case IBV_RATE_20_GBPS:  return 20000;
141	case IBV_RATE_30_GBPS:  return 30000;
142	case IBV_RATE_40_GBPS:  return 40000;
143	case IBV_RATE_60_GBPS:  return 60000;
144	case IBV_RATE_80_GBPS:  return 80000;
145	case IBV_RATE_120_GBPS: return 120000;
146	case IBV_RATE_14_GBPS:  return 14062;
147	case IBV_RATE_56_GBPS:  return 56250;
148	case IBV_RATE_112_GBPS: return 112500;
149	case IBV_RATE_168_GBPS: return 168750;
150	case IBV_RATE_25_GBPS:  return 25781;
151	case IBV_RATE_100_GBPS: return 103125;
152	case IBV_RATE_200_GBPS: return 206250;
153	case IBV_RATE_300_GBPS: return 309375;
154	default:               return -1;
155	}
156}
157
158enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps)
159{
160	switch (mbps) {
161	case 2500:   return IBV_RATE_2_5_GBPS;
162	case 5000:   return IBV_RATE_5_GBPS;
163	case 10000:  return IBV_RATE_10_GBPS;
164	case 20000:  return IBV_RATE_20_GBPS;
165	case 30000:  return IBV_RATE_30_GBPS;
166	case 40000:  return IBV_RATE_40_GBPS;
167	case 60000:  return IBV_RATE_60_GBPS;
168	case 80000:  return IBV_RATE_80_GBPS;
169	case 120000: return IBV_RATE_120_GBPS;
170	case 14062:  return IBV_RATE_14_GBPS;
171	case 56250:  return IBV_RATE_56_GBPS;
172	case 112500: return IBV_RATE_112_GBPS;
173	case 168750: return IBV_RATE_168_GBPS;
174	case 25781:  return IBV_RATE_25_GBPS;
175	case 103125: return IBV_RATE_100_GBPS;
176	case 206250: return IBV_RATE_200_GBPS;
177	case 309375: return IBV_RATE_300_GBPS;
178	default:     return IBV_RATE_MAX;
179	}
180}
181
182int __ibv_query_device(struct ibv_context *context,
183		       struct ibv_device_attr *device_attr)
184{
185	return context->ops.query_device(context, device_attr);
186}
187default_symver(__ibv_query_device, ibv_query_device);
188
189int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
190		     struct ibv_port_attr *port_attr)
191{
192	return context->ops.query_port(context, port_num, port_attr);
193}
194default_symver(__ibv_query_port, ibv_query_port);
195
196int __ibv_query_gid(struct ibv_context *context, uint8_t port_num,
197		    int index, union ibv_gid *gid)
198{
199	char name[24];
200	char attr[41];
201	uint16_t val;
202	int i;
203
204	snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index);
205
206	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
207				attr, sizeof attr) < 0)
208		return -1;
209
210	for (i = 0; i < 8; ++i) {
211		if (sscanf(attr + i * 5, "%hx", &val) != 1)
212			return -1;
213		gid->raw[i * 2    ] = val >> 8;
214		gid->raw[i * 2 + 1] = val & 0xff;
215	}
216
217	return 0;
218}
219default_symver(__ibv_query_gid, ibv_query_gid);
220
221int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num,
222		     int index, __be16 *pkey)
223{
224	char name[24];
225	char attr[8];
226	uint16_t val;
227
228	snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index);
229
230	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
231				attr, sizeof attr) < 0)
232		return -1;
233
234	if (sscanf(attr, "%hx", &val) != 1)
235		return -1;
236
237	*pkey = htobe16(val);
238	return 0;
239}
240default_symver(__ibv_query_pkey, ibv_query_pkey);
241
242struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context)
243{
244	struct ibv_pd *pd;
245
246	pd = context->ops.alloc_pd(context);
247	if (pd)
248		pd->context = context;
249
250	return pd;
251}
252default_symver(__ibv_alloc_pd, ibv_alloc_pd);
253
254int __ibv_dealloc_pd(struct ibv_pd *pd)
255{
256	return pd->context->ops.dealloc_pd(pd);
257}
258default_symver(__ibv_dealloc_pd, ibv_dealloc_pd);
259
260struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr,
261			    size_t length, int access)
262{
263	struct ibv_mr *mr;
264
265	if (ibv_dontfork_range(addr, length))
266		return NULL;
267
268	mr = pd->context->ops.reg_mr(pd, addr, length, access);
269	if (mr) {
270		mr->context = pd->context;
271		mr->pd      = pd;
272		mr->addr    = addr;
273		mr->length  = length;
274	} else
275		ibv_dofork_range(addr, length);
276
277	return mr;
278}
279default_symver(__ibv_reg_mr, ibv_reg_mr);
280
281int __ibv_rereg_mr(struct ibv_mr *mr, int flags,
282		   struct ibv_pd *pd, void *addr,
283		   size_t length, int access)
284{
285	int dofork_onfail = 0;
286	int err;
287	void *old_addr;
288	size_t old_len;
289
290	if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) {
291		errno = EINVAL;
292		return IBV_REREG_MR_ERR_INPUT;
293	}
294
295	if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) &&
296	    (!length || !addr)) {
297		errno = EINVAL;
298		return IBV_REREG_MR_ERR_INPUT;
299	}
300
301	if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) {
302		errno = EINVAL;
303		return IBV_REREG_MR_ERR_INPUT;
304	}
305
306	if (!mr->context->ops.rereg_mr) {
307		errno = ENOSYS;
308		return IBV_REREG_MR_ERR_INPUT;
309	}
310
311	if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
312		err = ibv_dontfork_range(addr, length);
313		if (err)
314			return IBV_REREG_MR_ERR_DONT_FORK_NEW;
315		dofork_onfail = 1;
316	}
317
318	old_addr = mr->addr;
319	old_len = mr->length;
320	err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access);
321	if (!err) {
322		if (flags & IBV_REREG_MR_CHANGE_PD)
323			mr->pd = pd;
324		if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
325			mr->addr    = addr;
326			mr->length  = length;
327			err = ibv_dofork_range(old_addr, old_len);
328			if (err)
329				return IBV_REREG_MR_ERR_DO_FORK_OLD;
330		}
331	} else {
332		err = IBV_REREG_MR_ERR_CMD;
333		if (dofork_onfail) {
334			if (ibv_dofork_range(addr, length))
335				err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW;
336		}
337	}
338
339	return err;
340}
341default_symver(__ibv_rereg_mr, ibv_rereg_mr);
342
343int __ibv_dereg_mr(struct ibv_mr *mr)
344{
345	int ret;
346	void *addr	= mr->addr;
347	size_t length	= mr->length;
348
349	ret = mr->context->ops.dereg_mr(mr);
350	if (!ret)
351		ibv_dofork_range(addr, length);
352
353	return ret;
354}
355default_symver(__ibv_dereg_mr, ibv_dereg_mr);
356
357static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
358{
359	struct ibv_abi_compat_v2 *t = context->abi_compat;
360	static int warned;
361
362	if (!pthread_mutex_trylock(&t->in_use))
363		return &t->channel;
364
365	if (!warned) {
366		fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n"
367			"    Only one completion channel can be created per context.\n",
368			abi_ver);
369		++warned;
370	}
371
372	return NULL;
373}
374
375struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context)
376{
377	struct ibv_comp_channel            *channel;
378	struct ibv_create_comp_channel      cmd;
379	struct ibv_create_comp_channel_resp resp;
380
381	if (abi_ver <= 2)
382		return ibv_create_comp_channel_v2(context);
383
384	channel = malloc(sizeof *channel);
385	if (!channel)
386		return NULL;
387
388	IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp);
389	if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) {
390		free(channel);
391		return NULL;
392	}
393
394	(void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
395
396	channel->context = context;
397	channel->fd      = resp.fd;
398	channel->refcnt  = 0;
399
400	return channel;
401}
402
403static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel)
404{
405	struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel;
406	pthread_mutex_unlock(&t->in_use);
407	return 0;
408}
409
410int ibv_destroy_comp_channel(struct ibv_comp_channel *channel)
411{
412	struct ibv_context *context;
413	int ret;
414
415	context = channel->context;
416	pthread_mutex_lock(&context->mutex);
417
418	if (channel->refcnt) {
419		ret = EBUSY;
420		goto out;
421	}
422
423	if (abi_ver <= 2) {
424		ret = ibv_destroy_comp_channel_v2(channel);
425		goto out;
426	}
427
428	close(channel->fd);
429	free(channel);
430	ret = 0;
431
432out:
433	pthread_mutex_unlock(&context->mutex);
434
435	return ret;
436}
437
438struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context,
439			       struct ibv_comp_channel *channel, int comp_vector)
440{
441	struct ibv_cq *cq;
442
443	cq = context->ops.create_cq(context, cqe, channel, comp_vector);
444
445	if (cq)
446		verbs_init_cq(cq, context, channel, cq_context);
447
448	return cq;
449}
450default_symver(__ibv_create_cq, ibv_create_cq);
451
452int __ibv_resize_cq(struct ibv_cq *cq, int cqe)
453{
454	if (!cq->context->ops.resize_cq)
455		return ENOSYS;
456
457	return cq->context->ops.resize_cq(cq, cqe);
458}
459default_symver(__ibv_resize_cq, ibv_resize_cq);
460
461int __ibv_destroy_cq(struct ibv_cq *cq)
462{
463	struct ibv_comp_channel *channel = cq->channel;
464	int ret;
465
466	ret = cq->context->ops.destroy_cq(cq);
467
468	if (channel) {
469		if (!ret) {
470			pthread_mutex_lock(&channel->context->mutex);
471			--channel->refcnt;
472			pthread_mutex_unlock(&channel->context->mutex);
473		}
474	}
475
476	return ret;
477}
478default_symver(__ibv_destroy_cq, ibv_destroy_cq);
479
480int __ibv_get_cq_event(struct ibv_comp_channel *channel,
481		       struct ibv_cq **cq, void **cq_context)
482{
483	struct ibv_comp_event ev;
484
485	if (read(channel->fd, &ev, sizeof ev) != sizeof ev)
486		return -1;
487
488	*cq         = (struct ibv_cq *) (uintptr_t) ev.cq_handle;
489	*cq_context = (*cq)->cq_context;
490
491	if ((*cq)->context->ops.cq_event)
492		(*cq)->context->ops.cq_event(*cq);
493
494	return 0;
495}
496default_symver(__ibv_get_cq_event, ibv_get_cq_event);
497
498void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents)
499{
500	pthread_mutex_lock(&cq->mutex);
501	cq->comp_events_completed += nevents;
502	pthread_cond_signal(&cq->cond);
503	pthread_mutex_unlock(&cq->mutex);
504}
505default_symver(__ibv_ack_cq_events, ibv_ack_cq_events);
506
507struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
508				 struct ibv_srq_init_attr *srq_init_attr)
509{
510	struct ibv_srq *srq;
511
512	if (!pd->context->ops.create_srq)
513		return NULL;
514
515	srq = pd->context->ops.create_srq(pd, srq_init_attr);
516	if (srq) {
517		srq->context          = pd->context;
518		srq->srq_context      = srq_init_attr->srq_context;
519		srq->pd               = pd;
520		srq->events_completed = 0;
521		pthread_mutex_init(&srq->mutex, NULL);
522		pthread_cond_init(&srq->cond, NULL);
523	}
524
525	return srq;
526}
527default_symver(__ibv_create_srq, ibv_create_srq);
528
529int __ibv_modify_srq(struct ibv_srq *srq,
530		     struct ibv_srq_attr *srq_attr,
531		     int srq_attr_mask)
532{
533	return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask);
534}
535default_symver(__ibv_modify_srq, ibv_modify_srq);
536
537int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr)
538{
539	return srq->context->ops.query_srq(srq, srq_attr);
540}
541default_symver(__ibv_query_srq, ibv_query_srq);
542
543int __ibv_destroy_srq(struct ibv_srq *srq)
544{
545	return srq->context->ops.destroy_srq(srq);
546}
547default_symver(__ibv_destroy_srq, ibv_destroy_srq);
548
549struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
550			       struct ibv_qp_init_attr *qp_init_attr)
551{
552	struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr);
553
554	if (qp) {
555		qp->context    	     = pd->context;
556		qp->qp_context 	     = qp_init_attr->qp_context;
557		qp->pd         	     = pd;
558		qp->send_cq    	     = qp_init_attr->send_cq;
559		qp->recv_cq    	     = qp_init_attr->recv_cq;
560		qp->srq        	     = qp_init_attr->srq;
561		qp->qp_type          = qp_init_attr->qp_type;
562		qp->state	     = IBV_QPS_RESET;
563		qp->events_completed = 0;
564		pthread_mutex_init(&qp->mutex, NULL);
565		pthread_cond_init(&qp->cond, NULL);
566	}
567
568	return qp;
569}
570default_symver(__ibv_create_qp, ibv_create_qp);
571
572int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
573		   int attr_mask,
574		   struct ibv_qp_init_attr *init_attr)
575{
576	int ret;
577
578	ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr);
579	if (ret)
580		return ret;
581
582	if (attr_mask & IBV_QP_STATE)
583		qp->state = attr->qp_state;
584
585	return 0;
586}
587default_symver(__ibv_query_qp, ibv_query_qp);
588
589int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
590		    int attr_mask)
591{
592	int ret;
593
594	ret = qp->context->ops.modify_qp(qp, attr, attr_mask);
595	if (ret)
596		return ret;
597
598	if (attr_mask & IBV_QP_STATE)
599		qp->state = attr->qp_state;
600
601	return 0;
602}
603default_symver(__ibv_modify_qp, ibv_modify_qp);
604
605int __ibv_destroy_qp(struct ibv_qp *qp)
606{
607	return qp->context->ops.destroy_qp(qp);
608}
609default_symver(__ibv_destroy_qp, ibv_destroy_qp);
610
611struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
612{
613	struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr);
614
615	if (ah) {
616		ah->context = pd->context;
617		ah->pd      = pd;
618	}
619
620	return ah;
621}
622default_symver(__ibv_create_ah, ibv_create_ah);
623
/* GID type strings as they appear in sysfs; they are not expected to change,
 * for ABI compatibility.
 */
627#define V1_TYPE "IB/RoCE v1"
628#define V2_TYPE "RoCE v2"
629int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num,
630		       unsigned int index, enum ibv_gid_type *type)
631{
632	char name[32];
633	char buff[11];
634
635	snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num,
636		 index);
637
638	/* Reset errno so that we can rely on its value upon any error flow in
639	 * ibv_read_sysfs_file.
640	 */
641	errno = 0;
642	if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff,
643				sizeof(buff)) <= 0) {
644		char *dir_path;
645		DIR *dir;
646
647		if (errno == EINVAL) {
648			/* In IB, this file doesn't exist and the kernel sets
649			 * errno to -EINVAL.
650			 */
651			*type = IBV_GID_TYPE_IB_ROCE_V1;
652			return 0;
653		}
654		if (asprintf(&dir_path, "%s/%s/%d/%s/",
655			     context->device->ibdev_path, "ports", port_num,
656			     "gid_attrs") < 0)
657			return -1;
658		dir = opendir(dir_path);
659		free(dir_path);
660		if (!dir) {
661			if (errno == ENOENT)
662				/* Assuming that if gid_attrs doesn't exist,
663				 * we have an old kernel and all GIDs are
664				 * IB/RoCE v1
665				 */
666				*type = IBV_GID_TYPE_IB_ROCE_V1;
667			else
668				return -1;
669		} else {
670			closedir(dir);
671			errno = EFAULT;
672			return -1;
673		}
674	} else {
675		if (!strcmp(buff, V1_TYPE)) {
676			*type = IBV_GID_TYPE_IB_ROCE_V1;
677		} else if (!strcmp(buff, V2_TYPE)) {
678			*type = IBV_GID_TYPE_ROCE_V2;
679		} else {
680			errno = ENOTSUP;
681			return -1;
682		}
683	}
684
685	return 0;
686}
687
688static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num,
689			      union ibv_gid *gid, enum ibv_gid_type gid_type)
690{
691	enum ibv_gid_type sgid_type = 0;
692	union ibv_gid sgid;
693	int i = 0, ret;
694
695	do {
696		ret = ibv_query_gid(context, port_num, i, &sgid);
697		if (!ret) {
698			ret = ibv_query_gid_type(context, port_num, i,
699						 &sgid_type);
700		}
701		i++;
702	} while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) ||
703		 (gid_type != sgid_type)));
704
705	return ret ? ret : i - 1;
706}
707
708static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
709{
710	ipv6->s6_addr32[0] = 0;
711	ipv6->s6_addr32[1] = 0;
712	ipv6->s6_addr32[2] = htobe32(0x0000FFFF);
713	ipv6->s6_addr32[3] = ipv4;
714}
715
716static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords)
717{
718	unsigned int i = 0;
719	uint32_t sum = 0;
720
721	for (i = 0; i < num_hwords; i++)
722		sum += *(data++);
723
724	sum = (sum & 0xffff) + (sum >> 16);
725
726	return (__sum16)~sum;
727}
728
729static inline int get_grh_header_version(struct ibv_grh *grh)
730{
731	int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf;
732	struct ip *ip4h = (struct ip *)((void *)grh + 20);
733	struct ip ip4h_checked;
734
735	if (ip6h_version != 6) {
736		if (ip4h->ip_v == 4)
737			return 4;
738		errno = EPROTONOSUPPORT;
739		return -1;
740	}
741	/* version may be 6 or 4 */
742	if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */
743		return 6;
744	/*
745	* Verify checksum.
746	* We can't write on scattered buffers so we have to copy to temp
747	* buffer.
748	*/
749	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
750	/* Need to set the checksum field (check) to 0 before re-calculating
751	 * the checksum.
752	 */
753	ip4h_checked.ip_sum = 0;
754	ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10);
755	/* if IPv4 header checksum is OK, believe it */
756	if (ip4h->ip_sum == ip4h_checked.ip_sum)
757		return 4;
758	return 6;
759}
760
761static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr,
762					      struct ibv_wc *wc,
763					      struct ibv_grh *grh,
764					      uint8_t port_num)
765{
766	uint32_t flow_class;
767
768	flow_class = be32toh(grh->version_tclass_flow);
769	ah_attr->grh.flow_label = flow_class & 0xFFFFF;
770	ah_attr->dlid = wc->slid;
771	ah_attr->sl = wc->sl;
772	ah_attr->src_path_bits = wc->dlid_path_bits;
773	ah_attr->port_num = port_num;
774}
775
776static inline int set_ah_attr_by_ipv4(struct ibv_context *context,
777				      struct ibv_ah_attr *ah_attr,
778				      struct ip *ip4h, uint8_t port_num)
779{
780	union ibv_gid sgid;
781	int ret;
782
783	/* No point searching multicast GIDs in GID table */
784	if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) {
785		errno = EINVAL;
786		return -1;
787	}
788
789	map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid);
790	ret = ibv_find_gid_index(context, port_num, &sgid,
791				 IBV_GID_TYPE_ROCE_V2);
792	if (ret < 0)
793		return ret;
794
795	map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr,
796			      (struct in6_addr *)&ah_attr->grh.dgid);
797	ah_attr->grh.sgid_index = (uint8_t) ret;
798	ah_attr->grh.hop_limit = ip4h->ip_ttl;
799	ah_attr->grh.traffic_class = ip4h->ip_tos;
800
801	return 0;
802}
803
804#define IB_NEXT_HDR    0x1b
805static inline int set_ah_attr_by_ipv6(struct ibv_context *context,
806				  struct ibv_ah_attr *ah_attr,
807				  struct ibv_grh *grh, uint8_t port_num)
808{
809	uint32_t flow_class;
810	uint32_t sgid_type;
811	int ret;
812
813	/* No point searching multicast GIDs in GID table */
814	if (grh->dgid.raw[0] == 0xFF) {
815		errno = EINVAL;
816		return -1;
817	}
818
819	ah_attr->grh.dgid = grh->sgid;
820	if (grh->next_hdr == IPPROTO_UDP) {
821		sgid_type = IBV_GID_TYPE_ROCE_V2;
822	} else if (grh->next_hdr == IB_NEXT_HDR) {
823		sgid_type = IBV_GID_TYPE_IB_ROCE_V1;
824	} else {
825		errno = EPROTONOSUPPORT;
826		return -1;
827	}
828
829	ret = ibv_find_gid_index(context, port_num, &grh->dgid,
830				 sgid_type);
831	if (ret < 0)
832		return ret;
833
834	ah_attr->grh.sgid_index = (uint8_t) ret;
835	flow_class = be32toh(grh->version_tclass_flow);
836	ah_attr->grh.hop_limit = grh->hop_limit;
837	ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
838
839	return 0;
840}
841
842int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num,
843			struct ibv_wc *wc, struct ibv_grh *grh,
844			struct ibv_ah_attr *ah_attr)
845{
846	int version;
847	int ret = 0;
848
849	memset(ah_attr, 0, sizeof *ah_attr);
850	set_ah_attr_generic_fields(ah_attr, wc, grh, port_num);
851
852	if (wc->wc_flags & IBV_WC_GRH) {
853		ah_attr->is_global = 1;
854		version = get_grh_header_version(grh);
855
856		if (version == 4)
857			ret = set_ah_attr_by_ipv4(context, ah_attr,
858						  (struct ip *)((void *)grh + 20),
859						  port_num);
860		else if (version == 6)
861			ret = set_ah_attr_by_ipv6(context, ah_attr, grh,
862						  port_num);
863		else
864			ret = -1;
865	}
866
867	return ret;
868}
869
870struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc,
871				     struct ibv_grh *grh, uint8_t port_num)
872{
873	struct ibv_ah_attr ah_attr;
874	int ret;
875
876	ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr);
877	if (ret)
878		return NULL;
879
880	return ibv_create_ah(pd, &ah_attr);
881}
882
883int __ibv_destroy_ah(struct ibv_ah *ah)
884{
885	return ah->context->ops.destroy_ah(ah);
886}
887default_symver(__ibv_destroy_ah, ibv_destroy_ah);
888
889int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
890{
891	return qp->context->ops.attach_mcast(qp, gid, lid);
892}
893default_symver(__ibv_attach_mcast, ibv_attach_mcast);
894
895int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
896{
897	return qp->context->ops.detach_mcast(qp, gid, lid);
898}
899default_symver(__ibv_detach_mcast, ibv_detach_mcast);
900
901static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
902{
903	return IN6_IS_ADDR_V4MAPPED(a) ||
904		/* IPv4 encoded multicast addresses */
905		(a->s6_addr32[0]  == htobe32(0xff0e0000) &&
906		((a->s6_addr32[1] |
907		 (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL));
908}
909
/* A view onto the address bytes embedded in a raw GID. */
struct peer_address {
	void *address;	/* first byte of the address */
	uint32_t size;	/* number of address bytes */
};

/*
 * Point 'peer_address' at the address bytes inside a raw GID.
 *
 * For AF_INET the address is the 4 bytes starting at offset 12 of the GID;
 * for AF_INET6 it is all 16 bytes.  No data is copied.
 *
 * Returns 0 on success, -1 for any other family (peer_address is then left
 * untouched).
 */
static inline int create_peer_from_gid(int family, void *raw_gid,
				       struct peer_address *peer_address)
{
	uint8_t *gid_bytes = raw_gid;

	if (family == AF_INET) {
		peer_address->address = gid_bytes + 12;
		peer_address->size = 4;
		return 0;
	}

	if (family == AF_INET6) {
		peer_address->address = gid_bytes;
		peer_address->size = 16;
		return 0;
	}

	return -1;
}
933
934#define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000
935int ibv_resolve_eth_l2_from_gid(struct ibv_context *context,
936				struct ibv_ah_attr *attr,
937				uint8_t eth_mac[ETHERNET_LL_SIZE],
938				uint16_t *vid)
939{
940#ifndef NRESOLVE_NEIGH
941	int dst_family;
942	int src_family;
943	int oif;
944	struct get_neigh_handler neigh_handler;
945	union ibv_gid sgid;
946	int ether_len;
947	struct peer_address src;
948	struct peer_address dst;
949	uint16_t ret_vid;
950	int ret = -EINVAL;
951	int err;
952
953	err = ibv_query_gid(context, attr->port_num,
954			    attr->grh.sgid_index, &sgid);
955
956	if (err)
957		return err;
958
959	err = neigh_init_resources(&neigh_handler,
960				   NEIGH_GET_DEFAULT_TIMEOUT_MS);
961
962	if (err)
963		return err;
964
965	dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ?
966			AF_INET : AF_INET6;
967	src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ?
968			AF_INET : AF_INET6;
969
970	if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst))
971		goto free_resources;
972
973	if (create_peer_from_gid(src_family, &sgid.raw, &src))
974		goto free_resources;
975
976	if (neigh_set_dst(&neigh_handler, dst_family, dst.address,
977			  dst.size))
978		goto free_resources;
979
980	if (neigh_set_src(&neigh_handler, src_family, src.address,
981			  src.size))
982		goto free_resources;
983
984	oif = neigh_get_oif_from_src(&neigh_handler);
985
986	if (oif > 0)
987		neigh_set_oif(&neigh_handler, oif);
988	else
989		goto free_resources;
990
991	ret = -EHOSTUNREACH;
992
993	/* blocking call */
994	if (process_get_neigh(&neigh_handler))
995		goto free_resources;
996
997	ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler);
998
999	if (ret_vid <= 0xfff)
1000		neigh_set_vlan_id(&neigh_handler, ret_vid);
1001
1002	/* We are using only Ethernet here */
1003	ether_len = neigh_get_ll(&neigh_handler,
1004				 eth_mac,
1005				 sizeof(uint8_t) * ETHERNET_LL_SIZE);
1006
1007	if (ether_len <= 0)
1008		goto free_resources;
1009
1010	*vid = ret_vid;
1011
1012	ret = 0;
1013
1014free_resources:
1015	neigh_free_resources(&neigh_handler);
1016
1017	return ret;
1018#else
1019	return -ENOSYS;
1020#endif
1021}
1022