verbs.c revision 331769
/*
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <infiniband/endian.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <errno.h>

#include "mlx4.h"
#include "mlx4-abi.h"
#include "wqe.h"

int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
{
	struct ibv_query_device cmd;
	uint64_t raw_fw_ver;
	unsigned major, minor, sub_minor;
	int ret;

	ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
	if (ret)
		return ret;

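	/*
	 * The 64-bit raw firmware version packs major in bits 47:32,
	 * minor in bits 31:16 and sub-minor in bits 15:0.
	 */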
	major     = (raw_fw_ver >> 32) & 0xffff;
	minor     = (raw_fw_ver >> 16) & 0xffff;
	sub_minor = raw_fw_ver & 0xffff;

	snprintf(attr->fw_ver, sizeof attr->fw_ver,
		 "%d.%d.%03d", major, minor, sub_minor);

	return 0;
}

int mlx4_query_device_ex(struct ibv_context *context,
			 const struct ibv_query_device_ex_input *input,
			 struct ibv_device_attr_ex *attr,
			 size_t attr_size)
{
	struct mlx4_context *mctx = to_mctx(context);
	struct mlx4_query_device_ex_resp resp = {};
	struct mlx4_query_device_ex cmd = {};
	uint64_t raw_fw_ver;
	unsigned sub_minor;
	unsigned major;
	unsigned minor;
	int err;

	err = ibv_cmd_query_device_ex(context, input, attr, attr_size,
				      &raw_fw_ver,
				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
				      &resp.ibv_resp, sizeof(resp.ibv_resp),
				      sizeof(resp));
	if (err)
		return err;

	if (resp.comp_mask & MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET) {
		mctx->core_clock.offset = resp.hca_core_clock_offset;
		mctx->core_clock.offset_valid = 1;
	}

	major     = (raw_fw_ver >> 32) & 0xffff;
	minor     = (raw_fw_ver >> 16) & 0xffff;
	sub_minor = raw_fw_ver & 0xffff;

	snprintf(attr->orig_attr.fw_ver, sizeof attr->orig_attr.fw_ver,
		 "%d.%d.%03d", major, minor, sub_minor);

	return 0;
}

#define READL(ptr) (*((uint32_t *)(ptr)))
static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles)
{
	unsigned int clockhi, clocklo, clockhi1;
	int i;
	struct mlx4_context *ctx = to_mctx(context);

	if (!ctx->hca_core_clock)
		return -EOPNOTSUPP;

	/*
	 * Handle wraparound: the clock is exposed as two 32-bit big-endian
	 * words.  Read high, low, then high again; if the high word changed
	 * in between, the low word rolled over during the read, so retry to
	 * get a consistent 64-bit value.
	 */
	for (i = 0; i < 2; i++) {
		clockhi = be32toh(READL(ctx->hca_core_clock));
		clocklo = be32toh(READL(ctx->hca_core_clock + 4));
		clockhi1 = be32toh(READL(ctx->hca_core_clock));
		if (clockhi == clockhi1)
			break;
	}

	*cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo;

	return 0;
}

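/*
 * The raw clock reported below is the free-running HCA cycle counter; it is
 * returned as-is in raw_clock.tv_nsec and is not converted to nanoseconds.
 */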
int mlx4_query_rt_values(struct ibv_context *context,
			 struct ibv_values_ex *values)
{
	uint32_t comp_mask = 0;
	int err = 0;

	if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) {
		uint64_t cycles;

		err = mlx4_read_clock(context, &cycles);
		if (!err) {
			values->raw_clock.tv_sec = 0;
			values->raw_clock.tv_nsec = cycles;
			comp_mask |= IBV_VALUES_MASK_RAW_CLOCK;
		}
	}

	values->comp_mask = comp_mask;

	return err;
}

int mlx4_query_port(struct ibv_context *context, uint8_t port,
		     struct ibv_port_attr *attr)
{
	struct ibv_query_port cmd;
	int err;

	err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
	if (!err && port <= MLX4_PORTS_NUM && port > 0) {
		struct mlx4_context *mctx = to_mctx(context);
		if (!mctx->port_query_cache[port - 1].valid) {
			mctx->port_query_cache[port - 1].link_layer =
				attr->link_layer;
			mctx->port_query_cache[port - 1].caps =
				attr->port_cap_flags;
			mctx->port_query_cache[port - 1].valid = 1;
		}
	}

	return err;
}

/* Only the fields in the port cache will be valid */
static int query_port_cache(struct ibv_context *context, uint8_t port_num,
			    struct ibv_port_attr *port_attr)
{
	struct mlx4_context *mctx = to_mctx(context);
	if (port_num <= 0 || port_num > MLX4_PORTS_NUM)
		return -EINVAL;
	if (mctx->port_query_cache[port_num - 1].valid) {
		port_attr->link_layer =
			mctx->port_query_cache[port_num - 1].link_layer;
		port_attr->port_cap_flags =
			mctx->port_query_cache[port_num - 1].caps;
		return 0;
	}
	return mlx4_query_port(context, port_num,
			       (struct ibv_port_attr *)port_attr);
}

struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context)
{
	struct ibv_alloc_pd       cmd;
	struct mlx4_alloc_pd_resp resp;
	struct mlx4_pd		 *pd;

	pd = malloc(sizeof *pd);
	if (!pd)
		return NULL;

	if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd,
			     &resp.ibv_resp, sizeof resp)) {
		free(pd);
		return NULL;
	}

	pd->pdn = resp.pdn;

	return &pd->ibv_pd;
}

int mlx4_free_pd(struct ibv_pd *pd)
{
	int ret;

	ret = ibv_cmd_dealloc_pd(pd);
	if (ret)
		return ret;

	free(to_mpd(pd));
	return 0;
}

struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
				struct ibv_xrcd_init_attr *attr)
{
	struct ibv_open_xrcd cmd;
	struct ibv_open_xrcd_resp resp;
	struct verbs_xrcd *xrcd;
	int ret;

	xrcd = calloc(1, sizeof *xrcd);
	if (!xrcd)
		return NULL;

	ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr,
				&cmd, sizeof cmd, &resp, sizeof resp);
	if (ret)
		goto err;

	return &xrcd->xrcd;

err:
	free(xrcd);
	return NULL;
}

int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd)
{
	struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
	int ret;

	ret = ibv_cmd_close_xrcd(xrcd);
	if (!ret)
		free(xrcd);

	return ret;
}

struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
			   int access)
{
	struct ibv_mr *mr;
	struct ibv_reg_mr cmd;
	struct ibv_reg_mr_resp resp;
	int ret;

	mr = malloc(sizeof *mr);
	if (!mr)
		return NULL;

	ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr,
			     access, mr, &cmd, sizeof cmd,
			     &resp, sizeof resp);
	if (ret) {
		free(mr);
		return NULL;
	}

	return mr;
}

int mlx4_rereg_mr(struct ibv_mr *mr,
		  int flags,
		  struct ibv_pd *pd, void *addr,
		  size_t length, int access)
{
	struct ibv_rereg_mr cmd;
	struct ibv_rereg_mr_resp resp;

	if (flags & IBV_REREG_MR_KEEP_VALID)
		return ENOTSUP;

	return ibv_cmd_rereg_mr(mr, flags, addr, length,
				(uintptr_t)addr,
				access, pd,
				&cmd, sizeof(cmd),
				&resp, sizeof(resp));
}

int mlx4_dereg_mr(struct ibv_mr *mr)
{
	int ret;

	ret = ibv_cmd_dereg_mr(mr);
	if (ret)
		return ret;

	free(mr);
	return 0;
}

struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
{
	struct ibv_mw *mw;
	struct ibv_alloc_mw cmd;
	struct ibv_alloc_mw_resp resp;
	int ret;

	mw = calloc(1, sizeof(*mw));
	if (!mw)
		return NULL;

	ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd),
			       &resp, sizeof(resp));
	if (ret) {
		free(mw);
		return NULL;
	}

	return mw;
}

int mlx4_dealloc_mw(struct ibv_mw *mw)
{
	int ret;
	struct ibv_dealloc_mw cmd;

	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
	if (ret)
		return ret;

	free(mw);
	return 0;
}

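/*
 * Memory window binding is implemented by posting an IBV_WR_BIND_MW work
 * request on the QP's send queue with a freshly incremented rkey; on
 * success the new rkey is stored back into the ibv_mw.
 */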
int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
		 struct ibv_mw_bind *mw_bind)
{
	struct ibv_send_wr *bad_wr = NULL;
	struct ibv_send_wr wr = { };
	int ret;

	wr.opcode = IBV_WR_BIND_MW;
	wr.next = NULL;

	wr.wr_id = mw_bind->wr_id;
	wr.send_flags = mw_bind->send_flags;

	wr.bind_mw.mw = mw;
	wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
	wr.bind_mw.bind_info = mw_bind->bind_info;

	ret = mlx4_post_send(qp, &wr, &bad_wr);
	if (ret)
		return ret;

	/* updating the mw with the latest rkey. */
	mw->rkey = wr.bind_mw.rkey;

	return 0;
}

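/*
 * Round a requested number of entries up to the next power of two; a
 * request of zero or one yields one.
 */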
int align_queue_size(int req)
{
	int nent;

	for (nent = 1; nent < req; nent <<= 1)
		; /* nothing */

	return nent;
}

enum {
	CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS	|
				       IBV_WC_EX_WITH_COMPLETION_TIMESTAMP
};

enum {
	CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS
};

enum {
	CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED
};

static int mlx4_cmd_create_cq(struct ibv_context *context,
			      struct ibv_cq_init_attr_ex *cq_attr,
			      struct mlx4_cq *cq)
{
	struct mlx4_create_cq      cmd = {};
	struct mlx4_create_cq_resp resp = {};
	int ret;

	cmd.buf_addr = (uintptr_t) cq->buf.buf;
	cmd.db_addr  = (uintptr_t) cq->set_ci_db;

	ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel,
				cq_attr->comp_vector,
				ibv_cq_ex_to_cq(&cq->ibv_cq),
				&cmd.ibv_cmd, sizeof(cmd),
				&resp.ibv_resp, sizeof(resp));
	if (!ret)
		cq->cqn = resp.cqn;

	return ret;
}

static int mlx4_cmd_create_cq_ex(struct ibv_context *context,
				 struct ibv_cq_init_attr_ex *cq_attr,
				 struct mlx4_cq *cq)
{
	struct mlx4_create_cq_ex      cmd = {};
	struct mlx4_create_cq_resp_ex resp = {};
	int ret;

	cmd.buf_addr = (uintptr_t) cq->buf.buf;
	cmd.db_addr  = (uintptr_t) cq->set_ci_db;

	ret = ibv_cmd_create_cq_ex(context, cq_attr,
				   &cq->ibv_cq, &cmd.ibv_cmd,
				   sizeof(cmd.ibv_cmd),
				   sizeof(cmd),
				   &resp.ibv_resp,
				   sizeof(resp.ibv_resp),
				   sizeof(resp));
	if (!ret)
		cq->cqn = resp.cqn;

	return ret;
}

static struct ibv_cq_ex *create_cq(struct ibv_context *context,
				   struct ibv_cq_init_attr_ex *cq_attr,
				   int cq_alloc_flags)
{
	struct mlx4_cq      *cq;
	int                  ret;
	struct mlx4_context *mctx = to_mctx(context);

	/* Sanity check CQ size before proceeding */
	if (cq_attr->cqe > 0x3fffff) {
		errno = EINVAL;
		return NULL;
	}

	if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) {
		errno = ENOTSUP;
		return NULL;
	}

	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
	    cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) {
		errno = ENOTSUP;
		return NULL;
	}

	if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS)
		return NULL;

	/* mlx4 devices don't support slid and sl in cqe when completion
	 * timestamp is enabled in the CQ
	 */
	if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) &&
	    (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) {
		errno = ENOTSUP;
		return NULL;
	}

	cq = malloc(sizeof *cq);
	if (!cq)
		return NULL;

	cq->cons_index = 0;

	if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
		goto err;

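	/*
	 * Size the buffer for one extra CQE and round up to a power of two;
	 * the extra entry is subtracted again below, so the CQE count passed
	 * to the kernel is the aligned size minus one.
	 */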
	cq_attr->cqe = align_queue_size(cq_attr->cqe + 1);

	if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cq_attr->cqe, mctx->cqe_size))
		goto err;

	cq->cqe_size = mctx->cqe_size;
	cq->set_ci_db  = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ);
	if (!cq->set_ci_db)
		goto err_buf;

	cq->arm_db     = cq->set_ci_db + 1;
	*cq->arm_db    = 0;
	cq->arm_sn     = 1;
	*cq->set_ci_db = 0;
	cq->flags = cq_alloc_flags;

	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
	    cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED)
		cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED;

	--cq_attr->cqe;
	if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
		ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq);
	else
		ret = mlx4_cmd_create_cq(context, cq_attr, cq);

	if (ret)
		goto err_db;

	if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
		mlx4_cq_fill_pfns(cq, cq_attr);

	return &cq->ibv_cq;

err_db:
	mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);

err_buf:
	mlx4_free_buf(&cq->buf);

err:
	free(cq);

	return NULL;
}

struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
			      struct ibv_comp_channel *channel,
			      int comp_vector)
{
	struct ibv_cq_ex *cq;
	struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
					      .comp_vector = comp_vector,
					      .wc_flags = IBV_WC_STANDARD_FLAGS};

	cq = create_cq(context, &cq_attr, 0);
	return cq ? ibv_cq_ex_to_cq(cq) : NULL;
}

struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
				    struct ibv_cq_init_attr_ex *cq_attr)
{
	/*
	 * Make local copy since some attributes might be adjusted
	 * for internal use.
	 */
	struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe,
						.channel = cq_attr->channel,
						.comp_vector = cq_attr->comp_vector,
						.wc_flags = cq_attr->wc_flags,
						.comp_mask = cq_attr->comp_mask,
						.flags = cq_attr->flags};

	return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED);
}

int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
{
	struct mlx4_cq *cq = to_mcq(ibcq);
	struct mlx4_resize_cq cmd;
	struct ibv_resize_cq_resp resp;
	struct mlx4_buf buf;
	int old_cqe, outst_cqe, ret;

	/* Sanity check CQ size before proceeding */
	if (cqe > 0x3fffff)
		return EINVAL;

	pthread_spin_lock(&cq->lock);

	cqe = align_queue_size(cqe + 1);
	if (cqe == ibcq->cqe + 1) {
		ret = 0;
		goto out;
	}

	/* Can't be smaller than the number of outstanding CQEs */
	outst_cqe = mlx4_get_outstanding_cqes(cq);
	if (cqe < outst_cqe + 1) {
		ret = EINVAL;
		goto out;
	}

	ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size);
	if (ret)
		goto out;

	old_cqe = ibcq->cqe;
	cmd.buf_addr = (uintptr_t) buf.buf;

	ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd,
				&resp, sizeof resp);
	if (ret) {
		mlx4_free_buf(&buf);
		goto out;
	}

	mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe);

	mlx4_free_buf(&cq->buf);
	cq->buf = buf;
	mlx4_update_cons_index(cq);

out:
	pthread_spin_unlock(&cq->lock);
	return ret;
}

int mlx4_destroy_cq(struct ibv_cq *cq)
{
	int ret;

	ret = ibv_cmd_destroy_cq(cq);
	if (ret)
		return ret;

	mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db);
	mlx4_free_buf(&to_mcq(cq)->buf);
	free(to_mcq(cq));

	return 0;
}

struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
				struct ibv_srq_init_attr *attr)
{
	struct mlx4_create_srq      cmd;
	struct mlx4_create_srq_resp resp;
	struct mlx4_srq		   *srq;
	int			    ret;

	/* Sanity check SRQ size before proceeding */
	if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
		return NULL;

	srq = malloc(sizeof *srq);
	if (!srq)
		return NULL;

	if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
		goto err;

	srq->max     = align_queue_size(attr->attr.max_wr + 1);
	srq->max_gs  = attr->attr.max_sge;
	srq->counter = 0;
	srq->ext_srq = 0;

	if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
		goto err;

	srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
	if (!srq->db)
		goto err_free;

	*srq->db = 0;

	cmd.buf_addr = (uintptr_t) srq->buf.buf;
	cmd.db_addr  = (uintptr_t) srq->db;

	ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
				 &cmd.ibv_cmd, sizeof cmd,
				 &resp.ibv_resp, sizeof resp);
	if (ret)
		goto err_db;

	return &srq->verbs_srq.srq;

err_db:
	mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);

err_free:
	free(srq->wrid);
	mlx4_free_buf(&srq->buf);

err:
	free(srq);

	return NULL;
}

struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
				   struct ibv_srq_init_attr_ex *attr_ex)
{
	if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
	    (attr_ex->srq_type == IBV_SRQT_BASIC))
		return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
	else if (attr_ex->srq_type == IBV_SRQT_XRC)
		return mlx4_create_xrc_srq(context, attr_ex);

	return NULL;
}

int mlx4_modify_srq(struct ibv_srq *srq,
		     struct ibv_srq_attr *attr,
		     int attr_mask)
{
	struct ibv_modify_srq cmd;

	return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
}

int mlx4_query_srq(struct ibv_srq *srq,
		    struct ibv_srq_attr *attr)
{
	struct ibv_query_srq cmd;

	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
}

int mlx4_destroy_srq(struct ibv_srq *srq)
{
	int ret;

	if (to_msrq(srq)->ext_srq)
		return mlx4_destroy_xrc_srq(srq);

	ret = ibv_cmd_destroy_srq(srq);
	if (ret)
		return ret;

	mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
	mlx4_free_buf(&to_msrq(srq)->buf);
	free(to_msrq(srq)->wrid);
	free(to_msrq(srq));

	return 0;
}

static int mlx4_cmd_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr,
				 struct mlx4_create_qp *cmd,
				 struct mlx4_qp *qp)
{
	struct mlx4_create_qp_ex cmd_ex;
	struct mlx4_create_qp_resp_ex resp;
	int ret;

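	/*
	 * Build the extended create_qp command from the legacy layout: copy
	 * the core ibv fields (user_handle through is_srq) into the base of
	 * the extended command, and the driver-specific fields (buf_addr
	 * through sq_no_prefetch) into its driver part.
	 */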
	memset(&cmd_ex, 0, sizeof(cmd_ex));
	memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle,
	       offsetof(typeof(cmd->ibv_cmd), is_srq) +
	       sizeof(cmd->ibv_cmd.is_srq) -
	       offsetof(typeof(cmd->ibv_cmd), user_handle));

	memcpy(&cmd_ex.drv_ex, &cmd->buf_addr,
	       offsetof(typeof(*cmd), sq_no_prefetch) +
	       sizeof(cmd->sq_no_prefetch) - sizeof(cmd->ibv_cmd));

	ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
				    sizeof(qp->verbs_qp), attr,
				    &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd),
				    sizeof(cmd_ex), &resp.ibv_resp,
				    sizeof(resp.ibv_resp), sizeof(resp));
	return ret;
}

enum {
	MLX4_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD |
					IBV_QP_INIT_ATTR_XRCD |
					IBV_QP_INIT_ATTR_CREATE_FLAGS),
};

enum {
	MLX4_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS),
};

struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr)
{
	struct mlx4_context *ctx = to_mctx(context);
	struct mlx4_create_qp     cmd;
	struct ibv_create_qp_resp resp;
	struct mlx4_qp		 *qp;
	int			  ret;

	/* Sanity check QP size before proceeding */
	if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */
		if (attr->cap.max_send_wr  > ctx->max_qp_wr ||
		    attr->cap.max_recv_wr  > ctx->max_qp_wr ||
		    attr->cap.max_send_sge > ctx->max_sge   ||
		    attr->cap.max_recv_sge > ctx->max_sge)
			return NULL;
	} else {
		if (attr->cap.max_send_wr  > 65536 ||
		    attr->cap.max_recv_wr  > 65536 ||
		    attr->cap.max_send_sge > 64    ||
		    attr->cap.max_recv_sge > 64)
			return NULL;
	}
	if (attr->cap.max_inline_data > 1024)
		return NULL;

	if (attr->comp_mask & ~MLX4_CREATE_QP_SUP_COMP_MASK)
		return NULL;

	qp = calloc(1, sizeof *qp);
	if (!qp)
		return NULL;

	if (attr->qp_type == IBV_QPT_XRC_RECV) {
		attr->cap.max_send_wr = qp->sq.wqe_cnt = 0;
	} else {
		mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
		/*
		 * We need to leave 2 KB + 1 WQE of headroom in the SQ to
		 * allow HW to prefetch.
		 */
		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
		qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
	}

	if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
	    attr->qp_type == IBV_QPT_XRC_RECV) {
		attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
	} else {
		qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
		if (attr->cap.max_recv_sge < 1)
			attr->cap.max_recv_sge = 1;
		if (attr->cap.max_recv_wr < 1)
			attr->cap.max_recv_wr = 1;
	}

	if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp))
		goto err;

	mlx4_init_qp_indices(qp);

	if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) ||
	    pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
		goto err_free;

	if (attr->cap.max_recv_sge) {
		qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
		if (!qp->db)
			goto err_free;

		*qp->db = 0;
		cmd.db_addr = (uintptr_t) qp->db;
	} else {
		cmd.db_addr = 0;
	}

	cmd.buf_addr	    = (uintptr_t) qp->buf.buf;
	cmd.log_sq_stride   = qp->sq.wqe_shift;
	for (cmd.log_sq_bb_count = 0;
	     qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
	     ++cmd.log_sq_bb_count)
		; /* nothing */
	cmd.sq_no_prefetch = 0;	/* OK for ABI 2: just a reserved field */
	memset(cmd.reserved, 0, sizeof cmd.reserved);
	pthread_mutex_lock(&to_mctx(context)->qp_table_mutex);

	if (attr->comp_mask & MLX4_CREATE_QP_EX2_COMP_MASK)
		ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp);
	else
		ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp,
					   sizeof(qp->verbs_qp), attr,
					   &cmd.ibv_cmd, sizeof(cmd), &resp,
					   sizeof(resp));
	if (ret)
		goto err_rq_db;

	if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
		ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp);
		if (ret)
			goto err_destroy;
	}
	pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);

	qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr;
	qp->rq.max_gs  = attr->cap.max_recv_sge;
	if (attr->qp_type != IBV_QPT_XRC_RECV)
		mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);

	qp->doorbell_qpn    = htobe32(qp->verbs_qp.qp.qp_num << 8);
	if (attr->sq_sig_all)
		qp->sq_signal_bits = htobe32(MLX4_WQE_CTRL_CQ_UPDATE);
	else
		qp->sq_signal_bits = 0;

	return &qp->verbs_qp.qp;

err_destroy:
	ibv_cmd_destroy_qp(&qp->verbs_qp.qp);

err_rq_db:
	pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
	if (attr->cap.max_recv_sge)
		mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db);

err_free:
	free(qp->sq.wrid);
	if (qp->rq.wqe_cnt)
		free(qp->rq.wrid);
	mlx4_free_buf(&qp->buf);

err:
	free(qp);

	return NULL;
}

struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
{
	struct ibv_qp_init_attr_ex attr_ex;
	struct ibv_qp *qp;

	memcpy(&attr_ex, attr, sizeof *attr);
	attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
	attr_ex.pd = pd;
	qp = mlx4_create_qp_ex(pd->context, &attr_ex);
	if (qp)
		memcpy(attr, &attr_ex, sizeof *attr);
	return qp;
}

struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr)
{
	struct ibv_open_qp cmd;
	struct ibv_create_qp_resp resp;
	struct mlx4_qp *qp;
	int ret;

	qp = calloc(1, sizeof *qp);
	if (!qp)
		return NULL;

	ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr,
			      &cmd, sizeof cmd, &resp, sizeof resp);
	if (ret)
		goto err;

	return &qp->verbs_qp.qp;

err:
	free(qp);
	return NULL;
}

int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
		   int attr_mask,
		   struct ibv_qp_init_attr *init_attr)
{
	struct ibv_query_qp cmd;
	struct mlx4_qp *qp = to_mqp(ibqp);
	int ret;

	ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd);
	if (ret)
		return ret;

	init_attr->cap.max_send_wr     = qp->sq.max_post;
	init_attr->cap.max_send_sge    = qp->sq.max_gs;
	init_attr->cap.max_inline_data = qp->max_inline_data;

	attr->cap = init_attr->cap;

	return 0;
}

int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		    int attr_mask)
{
	struct ibv_modify_qp cmd = {};
	struct ibv_port_attr port_attr;
	struct mlx4_qp *mqp = to_mqp(qp);
	struct ibv_device_attr device_attr;
	int ret;

	memset(&device_attr, 0, sizeof(device_attr));
	if (attr_mask & IBV_QP_PORT) {
		ret = ibv_query_port(qp->context, attr->port_num,
				     &port_attr);
		if (ret)
			return ret;
		mqp->link_layer = port_attr.link_layer;

		ret = ibv_query_device(qp->context, &device_attr);
		if (ret)
			return ret;

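		/*
		 * Cache checksum offload support for this QP type, based on
		 * the port's link layer and the device capability flags.
		 */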
		switch (qp->qp_type) {
		case IBV_QPT_UD:
			if ((mqp->link_layer == IBV_LINK_LAYER_INFINIBAND) &&
			    (device_attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM))
				mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_UD_OVER_IB |
						MLX4_RX_CSUM_VALID;
			break;
		case IBV_QPT_RAW_PACKET:
			if ((mqp->link_layer == IBV_LINK_LAYER_ETHERNET) &&
			    (device_attr.device_cap_flags & IBV_DEVICE_RAW_IP_CSUM))
				mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_RAW_OVER_ETH |
						MLX4_RX_CSUM_VALID;
			break;
		default:
			break;
		}
	}

	if (qp->state == IBV_QPS_RESET &&
	    attr_mask & IBV_QP_STATE   &&
	    attr->qp_state == IBV_QPS_INIT) {
		mlx4_qp_init_sq_ownership(to_mqp(qp));
	}

	ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd);

	if (!ret		       &&
	    (attr_mask & IBV_QP_STATE) &&
	    attr->qp_state == IBV_QPS_RESET) {
		if (qp->recv_cq)
			mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
				      qp->srq ? to_msrq(qp->srq) : NULL);
		if (qp->send_cq && qp->send_cq != qp->recv_cq)
			mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);

		mlx4_init_qp_indices(to_mqp(qp));
		if (to_mqp(qp)->rq.wqe_cnt)
			*to_mqp(qp)->db = 0;
	}

	return ret;
}

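/*
 * Lock/unlock both CQs attached to a QP.  When the send and receive CQs
 * differ, the locks are always taken in ascending CQN order and released in
 * the reverse order, so two threads working on the same pair of CQs cannot
 * deadlock.
 */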
static void mlx4_lock_cqs(struct ibv_qp *qp)
{
	struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
	struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);

	if (!qp->send_cq || !qp->recv_cq) {
		if (qp->send_cq)
			pthread_spin_lock(&send_cq->lock);
		else if (qp->recv_cq)
			pthread_spin_lock(&recv_cq->lock);
	} else if (send_cq == recv_cq) {
		pthread_spin_lock(&send_cq->lock);
	} else if (send_cq->cqn < recv_cq->cqn) {
		pthread_spin_lock(&send_cq->lock);
		pthread_spin_lock(&recv_cq->lock);
	} else {
		pthread_spin_lock(&recv_cq->lock);
		pthread_spin_lock(&send_cq->lock);
	}
}

static void mlx4_unlock_cqs(struct ibv_qp *qp)
{
	struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
	struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);

	if (!qp->send_cq || !qp->recv_cq) {
		if (qp->send_cq)
			pthread_spin_unlock(&send_cq->lock);
		else if (qp->recv_cq)
			pthread_spin_unlock(&recv_cq->lock);
	} else if (send_cq == recv_cq) {
		pthread_spin_unlock(&send_cq->lock);
	} else if (send_cq->cqn < recv_cq->cqn) {
		pthread_spin_unlock(&recv_cq->lock);
		pthread_spin_unlock(&send_cq->lock);
	} else {
		pthread_spin_unlock(&send_cq->lock);
		pthread_spin_unlock(&recv_cq->lock);
	}
}

int mlx4_destroy_qp(struct ibv_qp *ibqp)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	int ret;

	pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex);
	ret = ibv_cmd_destroy_qp(ibqp);
	if (ret) {
		pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
		return ret;
	}

	mlx4_lock_cqs(ibqp);

	if (ibqp->recv_cq)
		__mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
				ibqp->srq ? to_msrq(ibqp->srq) : NULL);
	if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
		__mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);

	if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
		mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);

	mlx4_unlock_cqs(ibqp);
	pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);

	if (qp->rq.wqe_cnt) {
		mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
		free(qp->rq.wrid);
	}
	if (qp->sq.wqe_cnt)
		free(qp->sq.wrid);
	mlx4_free_buf(&qp->buf);
	free(qp);

	return 0;
}

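/*
 * Returns nonzero if the GID is an IPv6 link-local address, i.e. its first
 * 64 bits are exactly fe80:0000:0000:0000.
 */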
static int link_local_gid(const union ibv_gid *gid)
{
	uint32_t *tmp = (uint32_t *)gid->raw;
	uint32_t hi = tmp[0];
	uint32_t lo = tmp[1];

	if (hi == htobe32(0xfe800000) && lo == 0)
		return 1;

	return 0;
}

static int is_multicast_gid(const union ibv_gid *gid)
{
	return gid->raw[0] == 0xff;
}

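/*
 * Extract the VLAN id embedded in bytes 11-12 of the GID; values of 0x1000
 * and above mean "no VLAN" and are reported as 0xffff.
 */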
static uint16_t get_vlan_id(union ibv_gid *gid)
{
	uint16_t vid;
	vid = gid->raw[11] << 8 | gid->raw[12];
	return vid < 0x1000 ? vid : 0xffff;
}

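/*
 * Derive the destination MAC (and VLAN) from the GRH when the port does not
 * use IP-based GIDs: for link-local unicast GIDs the MAC is rebuilt from the
 * EUI-64 in GID bytes 8-10 and 13-15 with the universal/local bit flipped;
 * multicast GIDs map onto the 33:33:xx:xx:xx:xx Ethernet multicast range.
 * Any other GID format is rejected.
 */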
static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah,
				  struct ibv_ah_attr *attr)
{
	int err, i;
	uint16_t vid;
	union ibv_gid sgid;

	if (link_local_gid(&attr->grh.dgid)) {
		memcpy(ah->mac, &attr->grh.dgid.raw[8], 3);
		memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3);
		ah->mac[0] ^= 2;

		vid = get_vlan_id(&attr->grh.dgid);
	} else if (is_multicast_gid(&attr->grh.dgid)) {
		ah->mac[0] = 0x33;
		ah->mac[1] = 0x33;
		for (i = 2; i < 6; ++i)
			ah->mac[i] = attr->grh.dgid.raw[i + 10];

		err = ibv_query_gid(pd->context, attr->port_num,
				    attr->grh.sgid_index, &sgid);
		if (err)
			return err;

		ah->av.dlid = htobe16(0xc000);
		ah->av.port_pd |= htobe32(1 << 31);

		vid = get_vlan_id(&sgid);
	} else
		return 1;

	if (vid != 0xffff) {
		ah->av.port_pd |= htobe32(1 << 29);
		ah->vlan = vid | ((attr->sl & 7) << 13);
	}

	return 0;
}

struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
{
	struct mlx4_ah *ah;
	struct ibv_port_attr port_attr;

	if (query_port_cache(pd->context, attr->port_num, &port_attr))
		return NULL;

	ah = malloc(sizeof *ah);
	if (!ah)
		return NULL;

	memset(&ah->av, 0, sizeof ah->av);

	ah->av.port_pd   = htobe32(to_mpd(pd)->pdn | (attr->port_num << 24));

	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
		ah->av.g_slid = attr->src_path_bits;
		ah->av.dlid   = htobe16(attr->dlid);
		ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 28);
	} else
		ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 29);

	if (attr->static_rate) {
		ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
		/* XXX check rate cap? */
	}
	if (attr->is_global) {
		ah->av.g_slid   |= 0x80;
		ah->av.gid_index = attr->grh.sgid_index;
		ah->av.hop_limit = attr->grh.hop_limit;
		ah->av.sl_tclass_flowlabel |=
			htobe32((attr->grh.traffic_class << 20) |
				attr->grh.flow_label);
		memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
	}

	if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
		if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) {
			uint16_t vid;

			if (ibv_resolve_eth_l2_from_gid(pd->context, attr,
							ah->mac, &vid)) {
				free(ah);
				return NULL;
			}

			if (vid <= 0xfff) {
				ah->av.port_pd |= htobe32(1 << 29);
				ah->vlan = vid |
					((attr->sl & 7) << 13);
			}
		} else {
			if (mlx4_resolve_grh_to_l2(pd, ah, attr)) {
				free(ah);
				return NULL;
			}
		}
	}

	return &ah->ibv_ah;
}

int mlx4_destroy_ah(struct ibv_ah *ah)
{
	free(to_mah(ah));

	return 0;
}