/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
	[IBV_WR_LOCAL_INV]		= MLX4_OPCODE_LOCAL_INVAL,
	[IBV_WR_BIND_MW]		= MLX4_OPCODE_BIND_MW,
	[IBV_WR_SEND_WITH_INV]		= MLX4_OPCODE_SEND_INVAL,
};

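/*
 * Helpers to locate WQE number n in the receive and send queues: each
 * queue is an array of fixed-size slots of (1 << wqe_shift) bytes
 * starting at the queue's offset within the QP buffer.
 */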
static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}

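/* Reset the producer and consumer indices of both work queues. */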
void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head	 = 0;
	qp->sq.tail	 = 0;
	qp->rq.head	 = 0;
	qp->rq.tail	 = 0;
}

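/*
 * Initialize every send WQE so that it cannot be mistaken for a valid
 * descriptor: set the ownership bit, a minimal fence_size, and stamp
 * the rest of the WQE.
 */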
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htobe32(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

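/*
 * Check whether posting nreq more work requests would overflow the
 * work queue.  If the lockless check fails, head - tail is recomputed
 * under the CQ lock, since the consumer index (tail) is advanced from
 * the CQ polling path.
 */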
static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

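/* Build a memory window bind segment from an IBV_WR_BIND_MW work request. */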
static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
{
	int acc = wr->bind_mw.bind_info.mw_access_flags;
	bseg->flags1 = 0;
	if (acc & IBV_ACCESS_REMOTE_ATOMIC)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
	if (acc & IBV_ACCESS_REMOTE_WRITE)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
	if (acc & IBV_ACCESS_REMOTE_READ)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);

	bseg->flags2 = 0;
	if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
	if (acc & IBV_ACCESS_ZERO_BASED)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);

	bseg->new_rkey = htobe32(wr->bind_mw.rkey);
	bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
	bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
	bseg->length = htobe64(wr->bind_mw.bind_info.length);
}

static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
		uint32_t rkey)
{
	iseg->mem_key	= htobe32(rkey);

	iseg->reserved1    = 0;
	iseg->reserved2    = 0;
	iseg->reserved3[0] = 0;
	iseg->reserved3[1] = 0;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htobe64(remote_addr);
	rseg->rkey     = htobe32(rkey);
	rseg->reserved = 0;
}

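/*
 * Fill the atomic segment: compare-and-swap carries both operands,
 * while fetch-and-add carries only the addend in swap_add.
 */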
static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htobe64(wr->wr.atomic.swap);
		aseg->compare  = htobe64(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

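/*
 * Copy the address vector and remote QPN/Q_Key of a UD work request
 * into the datagram segment.
 */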
static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
	dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
	dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

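/*
 * __set_data_seg() writes a scatter entry on the receive path; the
 * send-path variant set_data_seg() below must write byte_count last
 * (after a barrier) and encodes a zero-length SGE with the special
 * byte count 0x80000000.
 */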
static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htobe32(sg->length);
	dseg->lkey       = htobe32(sg->lkey);
	dseg->addr       = htobe64(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey       = htobe32(sg->lkey);
	dseg->addr       = htobe64(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	udma_to_device_barrier();

	if (likely(sg->length))
		dseg->byte_count = htobe32(sg->length);
	else
		dseg->byte_count = htobe32(0x80000000);
}

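/*
 * Post a chain of send work requests.  For each WR, the control
 * segment is written first, then the opcode-specific segments and the
 * data (or inline) segments; the ownership bit is set last, after a
 * barrier, so the HCA never sees a half-written descriptor.  The
 * doorbell (or BlueFlame copy for a single small WR) is rung once for
 * the whole chain.
 */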
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl = NULL;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size = 0;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = EINVAL;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->srcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htobe32(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC_SEND:
			ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
			/* fall through */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				if (!wr->num_sge)
					inl = 1;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;
			case IBV_WR_LOCAL_INV:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_local_inv_seg(wqe, wr->imm_data);
				wqe  += sizeof
					(struct mlx4_wqe_local_inval_seg);
				size += sizeof
					(struct mlx4_wqe_local_inval_seg) / 16;
				break;
			case IBV_WR_BIND_MW:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_bind_seg(wqe, wr);
				wqe  += sizeof
					(struct mlx4_wqe_bind_seg);
				size += sizeof
					(struct mlx4_wqe_bind_seg) / 16;
				break;
			case IBV_WR_SEND_WITH_INV:
				ctrl->imm = htobe32(wr->imm_data);
				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;

			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							   MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		case IBV_QPT_RAW_PACKET:
			/* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
			 * to indicate that no icrc should be calculated */
			ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							   MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = ENOMEM;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					udma_to_device_barrier(); /* see comment below */
					seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				udma_to_device_barrier();
				seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		udma_to_device_barrier();

		ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);

		ctrl->bf_qpn |= qp->doorbell_qpn;
		++qp->sq.head;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		mmio_wc_spinlock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		/* Flush before toggling bf_offset to be latency oriented */
		mmio_flush_writes();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
			    qp->doorbell_qpn);
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}

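/*
 * Post a chain of receive work requests.  Each receive WQE is just a
 * scatter list; the entry after the last SGE is terminated with
 * MLX4_INVALID_LKEY.  The receive doorbell record is updated once for
 * the whole chain.
 */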
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htobe32(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		*qp->db = htobe32(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}

static int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}

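/*
 * Compute the send WQE stride (sq.wqe_shift) as the smallest power of
 * two, at least 64 bytes, that can hold the control segment, the
 * largest transport-specific segment for this QP type, and either the
 * requested number of SGEs or the worst-case inline data layout.
 */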
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge	 = align(cap->max_inline_data +
				 num_inline_segs(cap->max_inline_data, type) *
				 sizeof (struct mlx4_wqe_inline_seg),
				 sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

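/*
 * Allocate the wrid arrays and the page-aligned work queue buffer for
 * a new QP, placing whichever queue has the larger stride first so
 * that both queues stay aligned to their stride.
 */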
int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs	 = cap->max_recv_sge;

	if (qp->sq.wqe_cnt) {
		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
		if (!qp->sq.wrid)
			return -1;
	}

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (qp->buf_size) {
		if (mlx4_alloc_buf(&qp->buf,
				   align(qp->buf_size, to_mdev(context->device)->page_size),
				   to_mdev(context->device)->page_size)) {
			free(qp->sq.wrid);
			free(qp->rq.wrid);
			return -1;
		}

		memset(qp->buf.buf, 0, qp->buf_size);
	} else {
		qp->buf.buf = NULL;
	}

	return 0;
}

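/*
 * Report the effective send queue limits back to the caller: the
 * number of SGEs and the amount of inline data that fit in one WQE
 * stride, and the number of WQEs that may be outstanding once the
 * spare (stamping) WQEs are reserved.
 */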
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;

	wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_UC:
	case IBV_QPT_RC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs	     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge    = qp->sq.max_gs;
	qp->sq.max_post	     = qp->sq.wqe_cnt - qp->sq_spare_wqes;
	cap->max_send_wr     = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data  = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}

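/*
 * The qp_table is a two-level lookup table indexed by QPN: the low
 * qp_table_mask bits select an entry within a second-level table, the
 * next bits select the second-level table itself.  mlx4_find_qp()
 * returns the QP registered for a QPN (or NULL), e.g. when
 * demultiplexing completions.
 */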
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

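/*
 * Insert a QP into the table, allocating the second-level table on
 * first use; refcnt counts the QPs stored under each second-level
 * table.
 */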
int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

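/*
 * Remove a QP from the table, freeing the second-level table once the
 * last QP stored under it is gone.
 */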
void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}