1/*
2 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
3 * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#define _GNU_SOURCE
35#include <config.h>
36
37#include <infiniband/endian.h>
38#include <stdio.h>
39#include <unistd.h>
40#include <stdlib.h>
41#include <errno.h>
42#include <string.h>
43#include <dirent.h>
44#include <netinet/in.h>
45#include <netinet/ip.h>
46#include <sys/socket.h>
47
48#include "ibverbs.h"
49#ifndef NRESOLVE_NEIGH
50#include <net/if.h>
51#include <net/if_arp.h>
52#include "neigh.h"
53#endif
54
/* Hack to avoid GCC's -Wmissing-prototypes and the similar error from sparse
   with these prototypes. Symbol versioning requires the goofy names; each
   prototype must match the version in verbs.h.
 */
59int __ibv_query_device(struct ibv_context *context,
60		       struct ibv_device_attr *device_attr);
61int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
62		     struct ibv_port_attr *port_attr);
63int __ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index,
64		    union ibv_gid *gid);
65int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int index,
66		     __be16 *pkey);
67struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context);
68int __ibv_dealloc_pd(struct ibv_pd *pd);
69struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
70			    int access);
71int __ibv_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, void *addr,
72		   size_t length, int access);
73int __ibv_dereg_mr(struct ibv_mr *mr);
74struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe,
75			       void *cq_context,
76			       struct ibv_comp_channel *channel,
77			       int comp_vector);
78int __ibv_resize_cq(struct ibv_cq *cq, int cqe);
79int __ibv_destroy_cq(struct ibv_cq *cq);
80int __ibv_get_cq_event(struct ibv_comp_channel *channel, struct ibv_cq **cq,
81		       void **cq_context);
82void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents);
83struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
84				 struct ibv_srq_init_attr *srq_init_attr);
85int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr,
86		     int srq_attr_mask);
87int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr);
88int __ibv_destroy_srq(struct ibv_srq *srq);
89struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
90			       struct ibv_qp_init_attr *qp_init_attr);
91int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
92		   struct ibv_qp_init_attr *init_attr);
93int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
94int __ibv_destroy_qp(struct ibv_qp *qp);
95struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
96int __ibv_destroy_ah(struct ibv_ah *ah);
97int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
98		       uint16_t lid);
99int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
100		       uint16_t lid);
101
102int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate)
103{
104	switch (rate) {
105	case IBV_RATE_2_5_GBPS: return  1;
106	case IBV_RATE_5_GBPS:   return  2;
107	case IBV_RATE_10_GBPS:  return  4;
108	case IBV_RATE_20_GBPS:  return  8;
109	case IBV_RATE_30_GBPS:  return 12;
110	case IBV_RATE_40_GBPS:  return 16;
111	case IBV_RATE_60_GBPS:  return 24;
112	case IBV_RATE_80_GBPS:  return 32;
113	case IBV_RATE_120_GBPS: return 48;
114	case IBV_RATE_28_GBPS:  return 11;
115	case IBV_RATE_50_GBPS:  return 20;
116	case IBV_RATE_400_GBPS: return 160;
117	case IBV_RATE_600_GBPS: return 240;
118	default:           return -1;
119	}
120}
121
122enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult)
123{
124	switch (mult) {
125	case 1:  return IBV_RATE_2_5_GBPS;
126	case 2:  return IBV_RATE_5_GBPS;
127	case 4:  return IBV_RATE_10_GBPS;
128	case 8:  return IBV_RATE_20_GBPS;
129	case 12: return IBV_RATE_30_GBPS;
130	case 16: return IBV_RATE_40_GBPS;
131	case 24: return IBV_RATE_60_GBPS;
132	case 32: return IBV_RATE_80_GBPS;
133	case 48: return IBV_RATE_120_GBPS;
134	case 11: return IBV_RATE_28_GBPS;
135	case 20: return IBV_RATE_50_GBPS;
136	case 160: return IBV_RATE_400_GBPS;
137	case 240: return IBV_RATE_600_GBPS;
138	default: return IBV_RATE_MAX;
139	}
140}
141
142int  __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate)
143{
144	switch (rate) {
145	case IBV_RATE_2_5_GBPS: return 2500;
146	case IBV_RATE_5_GBPS:   return 5000;
147	case IBV_RATE_10_GBPS:  return 10000;
148	case IBV_RATE_20_GBPS:  return 20000;
149	case IBV_RATE_30_GBPS:  return 30000;
150	case IBV_RATE_40_GBPS:  return 40000;
151	case IBV_RATE_60_GBPS:  return 60000;
152	case IBV_RATE_80_GBPS:  return 80000;
153	case IBV_RATE_120_GBPS: return 120000;
154	case IBV_RATE_14_GBPS:  return 14062;
155	case IBV_RATE_56_GBPS:  return 56250;
156	case IBV_RATE_112_GBPS: return 112500;
157	case IBV_RATE_168_GBPS: return 168750;
158	case IBV_RATE_25_GBPS:  return 25781;
159	case IBV_RATE_100_GBPS: return 103125;
160	case IBV_RATE_200_GBPS: return 206250;
161	case IBV_RATE_300_GBPS: return 309375;
162	case IBV_RATE_28_GBPS:  return 28125;
163	case IBV_RATE_50_GBPS:  return 53125;
164	case IBV_RATE_400_GBPS: return 425000;
165	case IBV_RATE_600_GBPS: return 637500;
166	default:               return -1;
167	}
168}
169
170enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps)
171{
172	switch (mbps) {
173	case 2500:   return IBV_RATE_2_5_GBPS;
174	case 5000:   return IBV_RATE_5_GBPS;
175	case 10000:  return IBV_RATE_10_GBPS;
176	case 20000:  return IBV_RATE_20_GBPS;
177	case 30000:  return IBV_RATE_30_GBPS;
178	case 40000:  return IBV_RATE_40_GBPS;
179	case 60000:  return IBV_RATE_60_GBPS;
180	case 80000:  return IBV_RATE_80_GBPS;
181	case 120000: return IBV_RATE_120_GBPS;
182	case 14062:  return IBV_RATE_14_GBPS;
183	case 56250:  return IBV_RATE_56_GBPS;
184	case 112500: return IBV_RATE_112_GBPS;
185	case 168750: return IBV_RATE_168_GBPS;
186	case 25781:  return IBV_RATE_25_GBPS;
187	case 103125: return IBV_RATE_100_GBPS;
188	case 206250: return IBV_RATE_200_GBPS;
189	case 309375: return IBV_RATE_300_GBPS;
190	case 28125:  return IBV_RATE_28_GBPS;
191	case 53125:  return IBV_RATE_50_GBPS;
192	case 425000: return IBV_RATE_400_GBPS;
193	case 637500: return IBV_RATE_600_GBPS;
194	default:     return IBV_RATE_MAX;
195	}
196}
197
198int __ibv_query_device(struct ibv_context *context,
199		       struct ibv_device_attr *device_attr)
200{
201	return context->ops.query_device(context, device_attr);
202}
203default_symver(__ibv_query_device, ibv_query_device);
204
205int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
206		     struct ibv_port_attr *port_attr)
207{
208	return context->ops.query_port(context, port_num, port_attr);
209}
210default_symver(__ibv_query_port, ibv_query_port);
211
212int __ibv_query_gid(struct ibv_context *context, uint8_t port_num,
213		    int index, union ibv_gid *gid)
214{
215	char name[24];
216	char attr[41];
217	uint16_t val;
218	int i;
219
220	snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index);
221
222	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
223				attr, sizeof attr) < 0)
224		return -1;
225
226	for (i = 0; i < 8; ++i) {
227		if (sscanf(attr + i * 5, "%hx", &val) != 1)
228			return -1;
229		gid->raw[i * 2    ] = val >> 8;
230		gid->raw[i * 2 + 1] = val & 0xff;
231	}
232
233	return 0;
234}
235default_symver(__ibv_query_gid, ibv_query_gid);
236
237int __ibv_query_pkey(struct ibv_context *context, uint8_t port_num,
238		     int index, __be16 *pkey)
239{
240	char name[24];
241	char attr[8];
242	uint16_t val;
243
244	snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index);
245
246	if (ibv_read_sysfs_file(context->device->ibdev_path, name,
247				attr, sizeof attr) < 0)
248		return -1;
249
250	if (sscanf(attr, "%hx", &val) != 1)
251		return -1;
252
253	*pkey = htobe16(val);
254	return 0;
255}
256default_symver(__ibv_query_pkey, ibv_query_pkey);
257
258struct ibv_pd *__ibv_alloc_pd(struct ibv_context *context)
259{
260	struct ibv_pd *pd;
261
262	pd = context->ops.alloc_pd(context);
263	if (pd)
264		pd->context = context;
265
266	return pd;
267}
268default_symver(__ibv_alloc_pd, ibv_alloc_pd);
269
270int __ibv_dealloc_pd(struct ibv_pd *pd)
271{
272	return pd->context->ops.dealloc_pd(pd);
273}
274default_symver(__ibv_dealloc_pd, ibv_dealloc_pd);
275
276struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr,
277			    size_t length, int access)
278{
279	struct ibv_mr *mr;
280
281	if (ibv_dontfork_range(addr, length))
282		return NULL;
283
284	mr = pd->context->ops.reg_mr(pd, addr, length, access);
285	if (mr) {
286		mr->context = pd->context;
287		mr->pd      = pd;
288		mr->addr    = addr;
289		mr->length  = length;
290	} else
291		ibv_dofork_range(addr, length);
292
293	return mr;
294}
295default_symver(__ibv_reg_mr, ibv_reg_mr);
296
297int __ibv_rereg_mr(struct ibv_mr *mr, int flags,
298		   struct ibv_pd *pd, void *addr,
299		   size_t length, int access)
300{
301	int dofork_onfail = 0;
302	int err;
303	void *old_addr;
304	size_t old_len;
305
306	if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) {
307		errno = EINVAL;
308		return IBV_REREG_MR_ERR_INPUT;
309	}
310
311	if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) &&
312	    (!length || !addr)) {
313		errno = EINVAL;
314		return IBV_REREG_MR_ERR_INPUT;
315	}
316
317	if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) {
318		errno = EINVAL;
319		return IBV_REREG_MR_ERR_INPUT;
320	}
321
322	if (!mr->context->ops.rereg_mr) {
323		errno = ENOSYS;
324		return IBV_REREG_MR_ERR_INPUT;
325	}
326
327	if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
328		err = ibv_dontfork_range(addr, length);
329		if (err)
330			return IBV_REREG_MR_ERR_DONT_FORK_NEW;
331		dofork_onfail = 1;
332	}
333
334	old_addr = mr->addr;
335	old_len = mr->length;
336	err = mr->context->ops.rereg_mr(mr, flags, pd, addr, length, access);
337	if (!err) {
338		if (flags & IBV_REREG_MR_CHANGE_PD)
339			mr->pd = pd;
340		if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) {
341			mr->addr    = addr;
342			mr->length  = length;
343			err = ibv_dofork_range(old_addr, old_len);
344			if (err)
345				return IBV_REREG_MR_ERR_DO_FORK_OLD;
346		}
347	} else {
348		err = IBV_REREG_MR_ERR_CMD;
349		if (dofork_onfail) {
350			if (ibv_dofork_range(addr, length))
351				err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW;
352		}
353	}
354
355	return err;
356}
357default_symver(__ibv_rereg_mr, ibv_rereg_mr);
358
359int __ibv_dereg_mr(struct ibv_mr *mr)
360{
361	int ret;
362	void *addr	= mr->addr;
363	size_t length	= mr->length;
364
365	ret = mr->context->ops.dereg_mr(mr);
366	if (!ret)
367		ibv_dofork_range(addr, length);
368
369	return ret;
370}
371default_symver(__ibv_dereg_mr, ibv_dereg_mr);
372
373static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
374{
375	struct ibv_abi_compat_v2 *t = context->abi_compat;
376	static int warned;
377
378	if (!pthread_mutex_trylock(&t->in_use))
379		return &t->channel;
380
381	if (!warned) {
382		fprintf(stderr, PFX "Warning: kernel's ABI version %d limits capacity.\n"
383			"    Only one completion channel can be created per context.\n",
384			abi_ver);
385		++warned;
386	}
387
388	return NULL;
389}
390
391struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context)
392{
393	struct ibv_comp_channel            *channel;
394	struct ibv_create_comp_channel      cmd;
395	struct ibv_create_comp_channel_resp resp;
396
397	if (abi_ver <= 2)
398		return ibv_create_comp_channel_v2(context);
399
400	channel = malloc(sizeof *channel);
401	if (!channel)
402		return NULL;
403
404	IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_COMP_CHANNEL, &resp, sizeof resp);
405	if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) {
406		free(channel);
407		return NULL;
408	}
409
410	(void) VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
411
412	channel->context = context;
413	channel->fd      = resp.fd;
414	channel->refcnt  = 0;
415
416	return channel;
417}
418
419static int ibv_destroy_comp_channel_v2(struct ibv_comp_channel *channel)
420{
421	struct ibv_abi_compat_v2 *t = (struct ibv_abi_compat_v2 *) channel;
422	pthread_mutex_unlock(&t->in_use);
423	return 0;
424}
425
426int ibv_destroy_comp_channel(struct ibv_comp_channel *channel)
427{
428	struct ibv_context *context;
429	int ret;
430
431	context = channel->context;
432	pthread_mutex_lock(&context->mutex);
433
434	if (channel->refcnt) {
435		ret = EBUSY;
436		goto out;
437	}
438
439	if (abi_ver <= 2) {
440		ret = ibv_destroy_comp_channel_v2(channel);
441		goto out;
442	}
443
444	close(channel->fd);
445	free(channel);
446	ret = 0;
447
448out:
449	pthread_mutex_unlock(&context->mutex);
450
451	return ret;
452}
453
454struct ibv_cq *__ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context,
455			       struct ibv_comp_channel *channel, int comp_vector)
456{
457	struct ibv_cq *cq;
458
459	cq = context->ops.create_cq(context, cqe, channel, comp_vector);
460
461	if (cq)
462		verbs_init_cq(cq, context, channel, cq_context);
463
464	return cq;
465}
466default_symver(__ibv_create_cq, ibv_create_cq);
467
468int __ibv_resize_cq(struct ibv_cq *cq, int cqe)
469{
470	if (!cq->context->ops.resize_cq)
471		return ENOSYS;
472
473	return cq->context->ops.resize_cq(cq, cqe);
474}
475default_symver(__ibv_resize_cq, ibv_resize_cq);
476
477int __ibv_destroy_cq(struct ibv_cq *cq)
478{
479	struct ibv_comp_channel *channel = cq->channel;
480	int ret;
481
482	ret = cq->context->ops.destroy_cq(cq);
483
484	if (channel) {
485		if (!ret) {
486			pthread_mutex_lock(&channel->context->mutex);
487			--channel->refcnt;
488			pthread_mutex_unlock(&channel->context->mutex);
489		}
490	}
491
492	return ret;
493}
494default_symver(__ibv_destroy_cq, ibv_destroy_cq);
495
496int __ibv_get_cq_event(struct ibv_comp_channel *channel,
497		       struct ibv_cq **cq, void **cq_context)
498{
499	struct ibv_comp_event ev;
500
501	if (read(channel->fd, &ev, sizeof ev) != sizeof ev)
502		return -1;
503
504	*cq         = (struct ibv_cq *) (uintptr_t) ev.cq_handle;
505	*cq_context = (*cq)->cq_context;
506
507	if ((*cq)->context->ops.cq_event)
508		(*cq)->context->ops.cq_event(*cq);
509
510	return 0;
511}
512default_symver(__ibv_get_cq_event, ibv_get_cq_event);
513
514void __ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents)
515{
516	pthread_mutex_lock(&cq->mutex);
517	cq->comp_events_completed += nevents;
518	pthread_cond_signal(&cq->cond);
519	pthread_mutex_unlock(&cq->mutex);
520}
521default_symver(__ibv_ack_cq_events, ibv_ack_cq_events);
522
523struct ibv_srq *__ibv_create_srq(struct ibv_pd *pd,
524				 struct ibv_srq_init_attr *srq_init_attr)
525{
526	struct ibv_srq *srq;
527
528	if (!pd->context->ops.create_srq)
529		return NULL;
530
531	srq = pd->context->ops.create_srq(pd, srq_init_attr);
532	if (srq) {
533		srq->context          = pd->context;
534		srq->srq_context      = srq_init_attr->srq_context;
535		srq->pd               = pd;
536		srq->events_completed = 0;
537		pthread_mutex_init(&srq->mutex, NULL);
538		pthread_cond_init(&srq->cond, NULL);
539	}
540
541	return srq;
542}
543default_symver(__ibv_create_srq, ibv_create_srq);
544
545int __ibv_modify_srq(struct ibv_srq *srq,
546		     struct ibv_srq_attr *srq_attr,
547		     int srq_attr_mask)
548{
549	return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask);
550}
551default_symver(__ibv_modify_srq, ibv_modify_srq);
552
553int __ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr)
554{
555	return srq->context->ops.query_srq(srq, srq_attr);
556}
557default_symver(__ibv_query_srq, ibv_query_srq);
558
559int __ibv_destroy_srq(struct ibv_srq *srq)
560{
561	return srq->context->ops.destroy_srq(srq);
562}
563default_symver(__ibv_destroy_srq, ibv_destroy_srq);
564
565struct ibv_qp *__ibv_create_qp(struct ibv_pd *pd,
566			       struct ibv_qp_init_attr *qp_init_attr)
567{
568	struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr);
569
570	if (qp) {
571		qp->context    	     = pd->context;
572		qp->qp_context 	     = qp_init_attr->qp_context;
573		qp->pd         	     = pd;
574		qp->send_cq    	     = qp_init_attr->send_cq;
575		qp->recv_cq    	     = qp_init_attr->recv_cq;
576		qp->srq        	     = qp_init_attr->srq;
577		qp->qp_type          = qp_init_attr->qp_type;
578		qp->state	     = IBV_QPS_RESET;
579		qp->events_completed = 0;
580		pthread_mutex_init(&qp->mutex, NULL);
581		pthread_cond_init(&qp->cond, NULL);
582	}
583
584	return qp;
585}
586default_symver(__ibv_create_qp, ibv_create_qp);
587
588int __ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
589		   int attr_mask,
590		   struct ibv_qp_init_attr *init_attr)
591{
592	int ret;
593
594	ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr);
595	if (ret)
596		return ret;
597
598	if (attr_mask & IBV_QP_STATE)
599		qp->state = attr->qp_state;
600
601	return 0;
602}
603default_symver(__ibv_query_qp, ibv_query_qp);
604
605int __ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
606		    int attr_mask)
607{
608	int ret;
609
610	ret = qp->context->ops.modify_qp(qp, attr, attr_mask);
611	if (ret)
612		return ret;
613
614	if (attr_mask & IBV_QP_STATE)
615		qp->state = attr->qp_state;
616
617	return 0;
618}
619default_symver(__ibv_modify_qp, ibv_modify_qp);
620
621int __ibv_destroy_qp(struct ibv_qp *qp)
622{
623	return qp->context->ops.destroy_qp(qp);
624}
625default_symver(__ibv_destroy_qp, ibv_destroy_qp);
626
627struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
628{
629	struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr);
630
631	if (ah) {
632		ah->context = pd->context;
633		ah->pd      = pd;
634	}
635
636	return ah;
637}
638default_symver(__ibv_create_ah, ibv_create_ah);
639
640/* GID types as appear in sysfs, no change is expected as of ABI
641 * compatibility.
642 */
643#define V1_TYPE "IB/RoCE v1"
644#define V2_TYPE "RoCE v2"
645int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num,
646		       unsigned int index, enum ibv_gid_type *type)
647{
648	char name[32];
649	char buff[11];
650
651	snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num,
652		 index);
653
654	/* Reset errno so that we can rely on its value upon any error flow in
655	 * ibv_read_sysfs_file.
656	 */
657	errno = 0;
658	if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff,
659				sizeof(buff)) <= 0) {
660		char *dir_path;
661		DIR *dir;
662
663		if (errno == EINVAL) {
664			/* In IB, this file doesn't exist and the kernel sets
665			 * errno to -EINVAL.
666			 */
667			*type = IBV_GID_TYPE_IB_ROCE_V1;
668			return 0;
669		}
670		if (asprintf(&dir_path, "%s/%s/%d/%s/",
671			     context->device->ibdev_path, "ports", port_num,
672			     "gid_attrs") < 0)
673			return -1;
674		dir = opendir(dir_path);
675		free(dir_path);
676		if (!dir) {
677			if (errno == ENOENT)
678				/* Assuming that if gid_attrs doesn't exist,
679				 * we have an old kernel and all GIDs are
680				 * IB/RoCE v1
681				 */
682				*type = IBV_GID_TYPE_IB_ROCE_V1;
683			else
684				return -1;
685		} else {
686			closedir(dir);
687			errno = EFAULT;
688			return -1;
689		}
690	} else {
691		if (!strcmp(buff, V1_TYPE)) {
692			*type = IBV_GID_TYPE_IB_ROCE_V1;
693		} else if (!strcmp(buff, V2_TYPE)) {
694			*type = IBV_GID_TYPE_ROCE_V2;
695		} else {
696			errno = ENOTSUP;
697			return -1;
698		}
699	}
700
701	return 0;
702}
703
704static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num,
705			      union ibv_gid *gid, enum ibv_gid_type gid_type)
706{
707	enum ibv_gid_type sgid_type = 0;
708	union ibv_gid sgid;
709	int i = 0, ret;
710
711	do {
712		ret = ibv_query_gid(context, port_num, i, &sgid);
713		if (!ret) {
714			ret = ibv_query_gid_type(context, port_num, i,
715						 &sgid_type);
716		}
717		i++;
718	} while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) ||
719		 (gid_type != sgid_type)));
720
721	return ret ? ret : i - 1;
722}
723
724static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6)
725{
726	ipv6->s6_addr32[0] = 0;
727	ipv6->s6_addr32[1] = 0;
728	ipv6->s6_addr32[2] = htobe32(0x0000FFFF);
729	ipv6->s6_addr32[3] = ipv4;
730}
731
732static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords)
733{
734	unsigned int i = 0;
735	uint32_t sum = 0;
736
737	for (i = 0; i < num_hwords; i++)
738		sum += *(data++);
739
740	sum = (sum & 0xffff) + (sum >> 16);
741
742	return (__sum16)~sum;
743}
744
745static inline int get_grh_header_version(struct ibv_grh *grh)
746{
747	int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf;
748	struct ip *ip4h = (struct ip *)((void *)grh + 20);
749	struct ip ip4h_checked;
750
751	if (ip6h_version != 6) {
752		if (ip4h->ip_v == 4)
753			return 4;
754		errno = EPROTONOSUPPORT;
755		return -1;
756	}
757	/* version may be 6 or 4 */
758	if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RoCE v2. */
759		return 6;
760	/*
761	* Verify checksum.
762	* We can't write on scattered buffers so we have to copy to temp
763	* buffer.
764	*/
765	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
766	/* Need to set the checksum field (check) to 0 before re-calculating
767	 * the checksum.
768	 */
769	ip4h_checked.ip_sum = 0;
770	ip4h_checked.ip_sum = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10);
771	/* if IPv4 header checksum is OK, believe it */
772	if (ip4h->ip_sum == ip4h_checked.ip_sum)
773		return 4;
774	return 6;
775}
776
777static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr,
778					      struct ibv_wc *wc,
779					      struct ibv_grh *grh,
780					      uint8_t port_num)
781{
782	uint32_t flow_class;
783
784	flow_class = be32toh(grh->version_tclass_flow);
785	ah_attr->grh.flow_label = flow_class & 0xFFFFF;
786	ah_attr->dlid = wc->slid;
787	ah_attr->sl = wc->sl;
788	ah_attr->src_path_bits = wc->dlid_path_bits;
789	ah_attr->port_num = port_num;
790}
791
792static inline int set_ah_attr_by_ipv4(struct ibv_context *context,
793				      struct ibv_ah_attr *ah_attr,
794				      struct ip *ip4h, uint8_t port_num)
795{
796	union ibv_gid sgid;
797	int ret;
798
799	/* No point searching multicast GIDs in GID table */
800	if (IN_CLASSD(be32toh(ip4h->ip_dst.s_addr))) {
801		errno = EINVAL;
802		return -1;
803	}
804
805	map_ipv4_addr_to_ipv6(ip4h->ip_dst.s_addr, (struct in6_addr *)&sgid);
806	ret = ibv_find_gid_index(context, port_num, &sgid,
807				 IBV_GID_TYPE_ROCE_V2);
808	if (ret < 0)
809		return ret;
810
811	map_ipv4_addr_to_ipv6(ip4h->ip_src.s_addr,
812			      (struct in6_addr *)&ah_attr->grh.dgid);
813	ah_attr->grh.sgid_index = (uint8_t) ret;
814	ah_attr->grh.hop_limit = ip4h->ip_ttl;
815	ah_attr->grh.traffic_class = ip4h->ip_tos;
816
817	return 0;
818}
819
820#define IB_NEXT_HDR    0x1b
821static inline int set_ah_attr_by_ipv6(struct ibv_context *context,
822				  struct ibv_ah_attr *ah_attr,
823				  struct ibv_grh *grh, uint8_t port_num)
824{
825	uint32_t flow_class;
826	uint32_t sgid_type;
827	int ret;
828
829	/* No point searching multicast GIDs in GID table */
830	if (grh->dgid.raw[0] == 0xFF) {
831		errno = EINVAL;
832		return -1;
833	}
834
835	ah_attr->grh.dgid = grh->sgid;
836	if (grh->next_hdr == IPPROTO_UDP) {
837		sgid_type = IBV_GID_TYPE_ROCE_V2;
838	} else if (grh->next_hdr == IB_NEXT_HDR) {
839		sgid_type = IBV_GID_TYPE_IB_ROCE_V1;
840	} else {
841		errno = EPROTONOSUPPORT;
842		return -1;
843	}
844
845	ret = ibv_find_gid_index(context, port_num, &grh->dgid,
846				 sgid_type);
847	if (ret < 0)
848		return ret;
849
850	ah_attr->grh.sgid_index = (uint8_t) ret;
851	flow_class = be32toh(grh->version_tclass_flow);
852	ah_attr->grh.hop_limit = grh->hop_limit;
853	ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
854
855	return 0;
856}
857
858int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num,
859			struct ibv_wc *wc, struct ibv_grh *grh,
860			struct ibv_ah_attr *ah_attr)
861{
862	int version;
863	int ret = 0;
864
865	memset(ah_attr, 0, sizeof *ah_attr);
866	set_ah_attr_generic_fields(ah_attr, wc, grh, port_num);
867
868	if (wc->wc_flags & IBV_WC_GRH) {
869		ah_attr->is_global = 1;
870		version = get_grh_header_version(grh);
871
872		if (version == 4)
873			ret = set_ah_attr_by_ipv4(context, ah_attr,
874						  (struct ip *)((void *)grh + 20),
875						  port_num);
876		else if (version == 6)
877			ret = set_ah_attr_by_ipv6(context, ah_attr, grh,
878						  port_num);
879		else
880			ret = -1;
881	}
882
883	return ret;
884}
885
886struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc,
887				     struct ibv_grh *grh, uint8_t port_num)
888{
889	struct ibv_ah_attr ah_attr;
890	int ret;
891
892	ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr);
893	if (ret)
894		return NULL;
895
896	return ibv_create_ah(pd, &ah_attr);
897}
898
899int __ibv_destroy_ah(struct ibv_ah *ah)
900{
901	return ah->context->ops.destroy_ah(ah);
902}
903default_symver(__ibv_destroy_ah, ibv_destroy_ah);
904
905int __ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
906{
907	return qp->context->ops.attach_mcast(qp, gid, lid);
908}
909default_symver(__ibv_attach_mcast, ibv_attach_mcast);
910
911int __ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
912{
913	return qp->context->ops.detach_mcast(qp, gid, lid);
914}
915default_symver(__ibv_detach_mcast, ibv_detach_mcast);
916
/*
 * Report whether an IPv6 address encodes an IPv4 address.
 *
 * Returns 1 when the address is IPv4-mapped (IN6_IS_ADDR_V4MAPPED) or when
 * its first 12 bytes are ff0e:0000:0000:0000:0000:ffff, the prefix used for
 * IPv4-encoded multicast addresses; 0 otherwise.
 */
static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
{
	/* IPv4 encoded multicast addresses */
	static const uint8_t v4_mcast_prefix[12] = {
		0xff, 0x0e, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0xff, 0xff,
	};

	if (IN6_IS_ADDR_V4MAPPED(a))
		return 1;

	return memcmp(a->s6_addr, v4_mcast_prefix,
		      sizeof v4_mcast_prefix) == 0;
}
925
/* Location and size, in bytes, of a network address inside a raw GID. */
struct peer_address {
	void *address;
	uint32_t size;
};

/*
 * Describe the address embedded in a 16-byte raw GID for the given family.
 *
 * AF_INET:  the address is the last 4 bytes of the GID (offset 12).
 * AF_INET6: the address is the whole 16-byte GID.
 *
 * Returns 0 on success; -1 for any other family, in which case
 * *peer_address is left untouched.
 */
static inline int create_peer_from_gid(int family, void *raw_gid,
				       struct peer_address *peer_address)
{
	if (family == AF_INET) {
		peer_address->address = (uint8_t *)raw_gid + 12;
		peer_address->size = 4;
		return 0;
	}

	if (family == AF_INET6) {
		peer_address->address = raw_gid;
		peer_address->size = 16;
		return 0;
	}

	return -1;
}
949
950#define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000
951int ibv_resolve_eth_l2_from_gid(struct ibv_context *context,
952				struct ibv_ah_attr *attr,
953				uint8_t eth_mac[ETHERNET_LL_SIZE],
954				uint16_t *vid)
955{
956#ifndef NRESOLVE_NEIGH
957	int dst_family;
958	int src_family;
959	int oif;
960	struct get_neigh_handler neigh_handler;
961	union ibv_gid sgid;
962	int ether_len;
963	struct peer_address src;
964	struct peer_address dst;
965	uint16_t ret_vid;
966	int ret = -EINVAL;
967	int err;
968
969	err = ibv_query_gid(context, attr->port_num,
970			    attr->grh.sgid_index, &sgid);
971
972	if (err)
973		return err;
974
975	err = neigh_init_resources(&neigh_handler,
976				   NEIGH_GET_DEFAULT_TIMEOUT_MS);
977
978	if (err)
979		return err;
980
981	dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ?
982			AF_INET : AF_INET6;
983	src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ?
984			AF_INET : AF_INET6;
985
986	if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst))
987		goto free_resources;
988
989	if (create_peer_from_gid(src_family, &sgid.raw, &src))
990		goto free_resources;
991
992	if (neigh_set_dst(&neigh_handler, dst_family, dst.address,
993			  dst.size))
994		goto free_resources;
995
996	if (neigh_set_src(&neigh_handler, src_family, src.address,
997			  src.size))
998		goto free_resources;
999
1000	oif = neigh_get_oif_from_src(&neigh_handler);
1001
1002	if (oif > 0)
1003		neigh_set_oif(&neigh_handler, oif);
1004	else
1005		goto free_resources;
1006
1007	ret = -EHOSTUNREACH;
1008
1009	/* blocking call */
1010	if (process_get_neigh(&neigh_handler))
1011		goto free_resources;
1012
1013	ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler);
1014
1015	if (ret_vid <= 0xfff)
1016		neigh_set_vlan_id(&neigh_handler, ret_vid);
1017
1018	/* We are using only Ethernet here */
1019	ether_len = neigh_get_ll(&neigh_handler,
1020				 eth_mac,
1021				 sizeof(uint8_t) * ETHERNET_LL_SIZE);
1022
1023	if (ether_len <= 0)
1024		goto free_resources;
1025
1026	*vid = ret_vid;
1027
1028	ret = 0;
1029
1030free_resources:
1031	neigh_free_resources(&neigh_handler);
1032
1033	return ret;
1034#else
1035	return -ENOSYS;
1036#endif
1037}
1038