/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

#include "mthca.h"
#include "doorbell.h"
#include "wqe.h"

enum {
	MTHCA_SEND_DOORBELL_FENCE = 1 << 5
};

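/*
 * Map libibverbs work request opcodes to the corresponding mthca
 * hardware opcodes placed in the WQE control segment.
 */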
static const uint8_t mthca_opcode[] = {
	[IBV_WR_SEND]                 = MTHCA_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
};

static void *get_recv_wqe(struct mthca_qp *qp, int n)
{
	return qp->buf.buf + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mthca_qp *qp, int n)
{
	return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
}

void mthca_init_qp_indices(struct mthca_qp *qp)
{
	qp->sq.next_ind  = 0;
	qp->sq.last_comp = qp->sq.max - 1;
	qp->sq.head      = 0;
	qp->sq.tail      = 0;
	qp->sq.last      = get_send_wqe(qp, qp->sq.max - 1);

	qp->rq.next_ind  = 0;
	qp->rq.last_comp = qp->rq.max - 1;
	qp->rq.head      = 0;
	qp->rq.tail      = 0;
	qp->rq.last      = get_recv_wqe(qp, qp->rq.max - 1);
}

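/*
 * Check whether posting nreq more WQEs would overflow the work queue.
 * The first head/tail comparison is done without any lock; only if the
 * queue looks full do we recheck under the CQ lock, since the tail is
 * advanced from the CQ poll path while that lock is held.
 */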
static inline int wq_overflow(struct mthca_wq *wq, int nreq, struct mthca_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max;
}

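/*
 * Post a list of send work requests on a Tavor-mode QP.  Each new WQE
 * is linked into the hardware's chain through the "next" segment of
 * the previous WQE, and a single doorbell describing the first WQE of
 * the chain (its offset, opcode and size) is rung at the end.
 */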
int mthca_tavor_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	void *wqe, *prev_wqe;
	int ind;
	int nreq;
	int ret = 0;
	int size;
	int size0 = 0;
	int i;
	/*
	 * f0 and op0 cannot be used unless nreq > 0, which means this
	 * function makes it through the loop at least once.  So the
	 * code inside the if (!size0) will be executed, and f0 and
	 * op0 will be initialized.  So any gcc warning about "may be
	 * used uninitialized" is bogus.
	 */
	uint32_t f0;
	uint32_t op0;

	pthread_spin_lock(&qp->sq.lock);

	ind = qp->sq.next_ind;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_send_wqe(qp, ind);
		prev_wqe = qp->sq.last;
		qp->sq.last = wqe;

		((struct mthca_next_seg *) wqe)->nda_op = 0;
		((struct mthca_next_seg *) wqe)->ee_nds = 0;
		((struct mthca_next_seg *) wqe)->flags =
			((wr->send_flags & IBV_SEND_SIGNALED) ?
			 htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
			((wr->send_flags & IBV_SEND_SOLICITED) ?
			 htonl(MTHCA_NEXT_SOLICIT) : 0)   |
			htonl(1);
		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			((struct mthca_next_seg *) wqe)->imm = wr->imm_data;

		wqe += sizeof (struct mthca_next_seg);
		size = sizeof (struct mthca_next_seg) / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_RC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.atomic.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.atomic.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;

				wqe += sizeof (struct mthca_raddr_seg);

				if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.swap);
					((struct mthca_atomic_seg *) wqe)->compare =
						htonll(wr->wr.atomic.compare_add);
				} else {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.compare_add);
					((struct mthca_atomic_seg *) wqe)->compare = 0;
				}

				wqe += sizeof (struct mthca_atomic_seg);
				size += (sizeof (struct mthca_raddr_seg) +
					 sizeof (struct mthca_atomic_seg)) / 16;
				break;

			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
			case IBV_WR_RDMA_READ:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UD:
			((struct mthca_tavor_ud_seg *) wqe)->lkey =
				htonl(to_mah(wr->wr.ud.ah)->key);
			((struct mthca_tavor_ud_seg *) wqe)->av_addr =
				htonll((uintptr_t) to_mah(wr->wr.ud.ah)->av);
			((struct mthca_tavor_ud_seg *) wqe)->dqpn =
				htonl(wr->wr.ud.remote_qpn);
			((struct mthca_tavor_ud_seg *) wqe)->qkey =
				htonl(wr->wr.ud.remote_qkey);

			wqe += sizeof (struct mthca_tavor_ud_seg);
			size += sizeof (struct mthca_tavor_ud_seg) / 16;
			break;

		default:
			break;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->send_flags & IBV_SEND_INLINE) {
			if (wr->num_sge) {
				struct mthca_inline_seg *seg = wqe;
				int s = 0;

				wqe += sizeof *seg;
				for (i = 0; i < wr->num_sge; ++i) {
					struct ibv_sge *sge = &wr->sg_list[i];

					s += sge->length;

					if (s > qp->max_inline_data) {
						ret = -1;
						*bad_wr = wr;
						goto out;
					}

					memcpy(wqe, (void *) (uintptr_t) sge->addr,
					       sge->length);
					wqe += sge->length;
				}

				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
				size += align(s + sizeof *seg, 16) / 16;
			}
		} else {
			struct mthca_data_seg *seg;

			for (i = 0; i < wr->num_sge; ++i) {
				seg = wqe;
				seg->byte_count = htonl(wr->sg_list[i].length);
				seg->lkey = htonl(wr->sg_list[i].lkey);
				seg->addr = htonll(wr->sg_list[i].addr);
				wqe += sizeof *seg;
			}

			size += wr->num_sge * (sizeof *seg / 16);
		}

		qp->wrid[ind + qp->rq.max] = wr->wr_id;

		if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

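		/*
		 * Link this WQE into the chain by patching the previous
		 * WQE's next segment: first its offset and opcode, then
		 * (after a write barrier) its size and control bits.
		 */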
		((struct mthca_next_seg *) prev_wqe)->nda_op =
			htonl(((ind << qp->sq.wqe_shift) +
			       qp->send_wqe_offset) |
			      mthca_opcode[wr->opcode]);
		/*
		 * Make sure that nda_op is written before setting ee_nds.
		 */
		wmb();
		((struct mthca_next_seg *) prev_wqe)->ee_nds =
			htonl((size0 ? 0 : MTHCA_NEXT_DBD) | size |
			((wr->send_flags & IBV_SEND_FENCE) ?
			 MTHCA_NEXT_FENCE : 0));

		if (!size0) {
			size0 = size;
			op0   = mthca_opcode[wr->opcode];
			f0    = wr->send_flags & IBV_SEND_FENCE ?
				MTHCA_SEND_DOORBELL_FENCE : 0;
		}

		++ind;
		if (ind >= qp->sq.max)
			ind -= qp->sq.max;
	}

out:
	if (nreq) {
		uint32_t doorbell[2];

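		/*
		 * Doorbell layout as built here: the first word carries
		 * the offset of the first new WQE together with its fence
		 * flag and opcode; the second word carries the QP number
		 * and the first WQE's size in 16-byte units.
		 */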
		doorbell[0] = htonl(((qp->sq.next_ind << qp->sq.wqe_shift) +
				     qp->send_wqe_offset) | f0 | op0);
		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
	}

	qp->sq.next_ind = ind;
	qp->sq.head    += nreq;

	pthread_spin_unlock(&qp->sq.lock);
	return ret;
}

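/*
 * Post a list of receive work requests on a Tavor-mode QP.  The
 * receive doorbell carries a WQE count, so long chains are broken up
 * and the doorbell is rung every MTHCA_TAVOR_MAX_WQES_PER_RECV_DB
 * requests.
 */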
int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
			  struct ibv_recv_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	uint32_t doorbell[2];
	int ret = 0;
	int nreq;
	int i;
	int size;
	int size0 = 0;
	int ind;
	void *wqe;
	void *prev_wqe;

	pthread_spin_lock(&qp->rq.lock);

	ind = qp->rq.next_ind;

	for (nreq = 0; wr; wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_recv_wqe(qp, ind);
		prev_wqe = qp->rq.last;
		qp->rq.last = wqe;

		((struct mthca_next_seg *) wqe)->ee_nds =
			htonl(MTHCA_NEXT_DBD);
		((struct mthca_next_seg *) wqe)->flags =
			htonl(MTHCA_NEXT_CQ_UPDATE);

		wqe += sizeof (struct mthca_next_seg);
		size = sizeof (struct mthca_next_seg) / 16;

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		for (i = 0; i < wr->num_sge; ++i) {
			((struct mthca_data_seg *) wqe)->byte_count =
				htonl(wr->sg_list[i].length);
			((struct mthca_data_seg *) wqe)->lkey =
				htonl(wr->sg_list[i].lkey);
			((struct mthca_data_seg *) wqe)->addr =
				htonll(wr->sg_list[i].addr);
			wqe += sizeof (struct mthca_data_seg);
			size += sizeof (struct mthca_data_seg) / 16;
		}

		qp->wrid[ind] = wr->wr_id;

		((struct mthca_next_seg *) prev_wqe)->ee_nds =
			htonl(MTHCA_NEXT_DBD | size);

		if (!size0)
			size0 = size;

		++ind;
		if (ind >= qp->rq.max)
			ind -= qp->rq.max;

		++nreq;
		if (nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) {
			nreq = 0;

			doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
			doorbell[1] = htonl(ibqp->qp_num << 8);

			/*
			 * Make sure that descriptors are written
			 * before doorbell is rung.
			 */
			wmb();

			mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);

			qp->rq.next_ind = ind;
			qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
			size0 = 0;
		}
	}

out:
	if (nreq) {
		doorbell[0] = htonl((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
		doorbell[1] = htonl((ibqp->qp_num << 8) | nreq);

		/*
		 * Make sure that descriptors are written before
		 * doorbell is rung.
		 */
		wmb();

		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_RECV_DOORBELL);
	}

	qp->rq.next_ind = ind;
	qp->rq.head    += nreq;

	pthread_spin_unlock(&qp->rq.lock);
	return ret;
}

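/*
 * Post a list of send work requests on an Arbel (mem-free) QP.  In
 * addition to the MMIO doorbell, mem-free mode keeps a doorbell record
 * (*qp->sq.db) in memory that must be updated first; chains longer
 * than MTHCA_ARBEL_MAX_WQES_PER_SEND_DB are split across several
 * doorbells.
 */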
int mthca_arbel_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	uint32_t doorbell[2];
	void *wqe, *prev_wqe;
	int ind;
	int nreq;
	int ret = 0;
	int size;
	int size0 = 0;
	int i;
	/*
	 * f0 and op0 cannot be used unless nreq > 0, which means this
	 * function makes it through the loop at least once.  So the
	 * code inside the if (!size0) will be executed, and f0 and
	 * op0 will be initialized.  So any gcc warning about "may be
	 * used uninitialized" is bogus.
	 */
	uint32_t f0;
	uint32_t op0;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head & (qp->sq.max - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
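		/*
		 * Flush a full batch before building the next WQE, so
		 * that a single doorbell never has to describe more than
		 * MTHCA_ARBEL_MAX_WQES_PER_SEND_DB requests.
		 */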
		if (nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB) {
			nreq = 0;

			doorbell[0] = htonl((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
					    ((qp->sq.head & 0xffff) << 8) | f0 | op0);
			doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

			qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;

			/*
			 * Make sure that descriptors are written before
			 * doorbell record.
			 */
			wmb();
			*qp->sq.db = htonl(qp->sq.head & 0xffff);

			/*
			 * Make sure doorbell record is written before we
			 * write MMIO send doorbell.
			 */
			wmb();
			mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);

			size0 = 0;
		}

		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_send_wqe(qp, ind);
		prev_wqe = qp->sq.last;
		qp->sq.last = wqe;

		((struct mthca_next_seg *) wqe)->flags =
			((wr->send_flags & IBV_SEND_SIGNALED) ?
			 htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
			((wr->send_flags & IBV_SEND_SOLICITED) ?
			 htonl(MTHCA_NEXT_SOLICIT) : 0)   |
			htonl(1);
		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			((struct mthca_next_seg *) wqe)->imm = wr->imm_data;

		wqe += sizeof (struct mthca_next_seg);
		size = sizeof (struct mthca_next_seg) / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_RC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.atomic.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.atomic.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;

				wqe += sizeof (struct mthca_raddr_seg);

				if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.swap);
					((struct mthca_atomic_seg *) wqe)->compare =
						htonll(wr->wr.atomic.compare_add);
				} else {
					((struct mthca_atomic_seg *) wqe)->swap_add =
						htonll(wr->wr.atomic.compare_add);
					((struct mthca_atomic_seg *) wqe)->compare = 0;
				}

				wqe += sizeof (struct mthca_atomic_seg);
				size += (sizeof (struct mthca_raddr_seg) +
					 sizeof (struct mthca_atomic_seg)) / 16;
				break;

			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
			case IBV_WR_RDMA_READ:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				((struct mthca_raddr_seg *) wqe)->raddr =
					htonll(wr->wr.rdma.remote_addr);
				((struct mthca_raddr_seg *) wqe)->rkey =
					htonl(wr->wr.rdma.rkey);
				((struct mthca_raddr_seg *) wqe)->reserved = 0;
				wqe += sizeof (struct mthca_raddr_seg);
				size += sizeof (struct mthca_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}

			break;

		case IBV_QPT_UD:
			memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
			       to_mah(wr->wr.ud.ah)->av, sizeof (struct mthca_av));
			((struct mthca_arbel_ud_seg *) wqe)->dqpn =
				htonl(wr->wr.ud.remote_qpn);
			((struct mthca_arbel_ud_seg *) wqe)->qkey =
				htonl(wr->wr.ud.remote_qkey);

			wqe += sizeof (struct mthca_arbel_ud_seg);
			size += sizeof (struct mthca_arbel_ud_seg) / 16;
			break;

		default:
			break;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->send_flags & IBV_SEND_INLINE) {
			if (wr->num_sge) {
				struct mthca_inline_seg *seg = wqe;
				int s = 0;

				wqe += sizeof *seg;
				for (i = 0; i < wr->num_sge; ++i) {
					struct ibv_sge *sge = &wr->sg_list[i];

					s += sge->length;

					if (s > qp->max_inline_data) {
						ret = -1;
						*bad_wr = wr;
						goto out;
					}

					memcpy(wqe, (void *) (uintptr_t) sge->addr,
					       sge->length);
					wqe += sge->length;
				}

				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
				size += align(s + sizeof *seg, 16) / 16;
			}
		} else {
			struct mthca_data_seg *seg;

			for (i = 0; i < wr->num_sge; ++i) {
				seg = wqe;
				seg->byte_count = htonl(wr->sg_list[i].length);
				seg->lkey = htonl(wr->sg_list[i].lkey);
				seg->addr = htonll(wr->sg_list[i].addr);
				wqe += sizeof *seg;
			}

			size += wr->num_sge * (sizeof *seg / 16);
		}

		qp->wrid[ind + qp->rq.max] = wr->wr_id;

		if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		((struct mthca_next_seg *) prev_wqe)->nda_op =
			htonl(((ind << qp->sq.wqe_shift) +
			       qp->send_wqe_offset) |
			      mthca_opcode[wr->opcode]);
		wmb();
		((struct mthca_next_seg *) prev_wqe)->ee_nds =
			htonl(MTHCA_NEXT_DBD | size |
			      ((wr->send_flags & IBV_SEND_FENCE) ?
			       MTHCA_NEXT_FENCE : 0));

		if (!size0) {
			size0 = size;
			op0   = mthca_opcode[wr->opcode];
			f0    = wr->send_flags & IBV_SEND_FENCE ?
				MTHCA_SEND_DOORBELL_FENCE : 0;
		}

		++ind;
		if (ind >= qp->sq.max)
			ind -= qp->sq.max;
	}

out:
	if (nreq) {
		doorbell[0] = htonl((nreq << 24)                  |
				    ((qp->sq.head & 0xffff) << 8) |
				    f0 | op0);
		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);

		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();
		*qp->sq.db = htonl(qp->sq.head & 0xffff);

		/*
		 * Make sure doorbell record is written before we
		 * write MMIO send doorbell.
		 */
		wmb();
		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
	}

	pthread_spin_unlock(&qp->sq.lock);
	return ret;
}

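/*
 * Post a list of receive work requests on an Arbel (mem-free) QP.  No
 * MMIO doorbell is needed here; new receive WQEs are advertised to the
 * hardware through the doorbell record (*qp->rq.db) alone.
 */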
int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
			  struct ibv_recv_wr **bad_wr)
{
	struct mthca_qp *qp = to_mqp(ibqp);
	int ret = 0;
	int nreq;
	int ind;
	int i;
	void *wqe;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.max - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		wqe = get_recv_wqe(qp, ind);

		((struct mthca_next_seg *) wqe)->flags = 0;

		wqe += sizeof (struct mthca_next_seg);

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		for (i = 0; i < wr->num_sge; ++i) {
			((struct mthca_data_seg *) wqe)->byte_count =
				htonl(wr->sg_list[i].length);
			((struct mthca_data_seg *) wqe)->lkey =
				htonl(wr->sg_list[i].lkey);
			((struct mthca_data_seg *) wqe)->addr =
				htonll(wr->sg_list[i].addr);
			wqe += sizeof (struct mthca_data_seg);
		}

		if (i < qp->rq.max_gs) {
			((struct mthca_data_seg *) wqe)->byte_count = 0;
			((struct mthca_data_seg *) wqe)->lkey = htonl(MTHCA_INVAL_LKEY);
			((struct mthca_data_seg *) wqe)->addr = 0;
		}

		qp->wrid[ind] = wr->wr_id;

		++ind;
		if (ind >= qp->rq.max)
			ind -= qp->rq.max;
	}
out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();
		*qp->rq.db = htonl(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);
	return ret;
}

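/*
 * Allocate and initialize the WQE buffer for a QP.  Receive WQEs are
 * placed at the start of the buffer and send WQEs follow at
 * qp->send_wqe_offset; both WQE sizes are rounded up to a power of
 * two (wqe_shift) so that WQE addresses can be computed by shifting.
 */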
int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type, struct mthca_qp *qp)
{
	int size;
	int max_sq_sge;
	struct mthca_next_seg *next;
	int i;

	qp->rq.max_gs    = cap->max_recv_sge;
	qp->sq.max_gs    = cap->max_send_sge;
	max_sq_sge       = align(cap->max_inline_data + sizeof (struct mthca_inline_seg),
				 sizeof (struct mthca_data_seg)) / sizeof (struct mthca_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	qp->wrid = malloc((qp->rq.max + qp->sq.max) * sizeof (uint64_t));
	if (!qp->wrid)
		return -1;

	size = sizeof (struct mthca_next_seg) +
		qp->rq.max_gs * sizeof (struct mthca_data_seg);

	for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
	     qp->rq.wqe_shift++)
		; /* nothing */

	size = max_sq_sge * sizeof (struct mthca_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += mthca_is_memfree(pd->context) ?
			sizeof (struct mthca_arbel_ud_seg) :
			sizeof (struct mthca_tavor_ud_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mthca_raddr_seg);
		break;

	case IBV_QPT_RC:
		size += sizeof (struct mthca_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mthca_atomic_seg) +
			    sizeof (struct mthca_raddr_seg) +
			    sizeof (struct mthca_data_seg)))
			size = (sizeof (struct mthca_atomic_seg) +
				sizeof (struct mthca_raddr_seg) +
				sizeof (struct mthca_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mthca_bind_seg))
		size = sizeof (struct mthca_bind_seg);

	size += sizeof (struct mthca_next_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */

	qp->send_wqe_offset = align(qp->rq.max << qp->rq.wqe_shift,
				    1 << qp->sq.wqe_shift);

	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);

	if (mthca_alloc_buf(&qp->buf,
			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
			    to_mdev(pd->context->device)->page_size)) {
		free(qp->wrid);
		return -1;
	}

	memset(qp->buf.buf, 0, qp->buf_size);

	if (mthca_is_memfree(pd->context)) {
		struct mthca_data_seg *scatter;
		uint32_t sz;

		sz = htonl((sizeof (struct mthca_next_seg) +
			    qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16);

		for (i = 0; i < qp->rq.max; ++i) {
			next = get_recv_wqe(qp, i);
			next->nda_op = htonl(((i + 1) & (qp->rq.max - 1)) <<
					     qp->rq.wqe_shift);
			next->ee_nds = sz;

			for (scatter = (void *) (next + 1);
			     (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift);
			     ++scatter)
				scatter->lkey = htonl(MTHCA_INVAL_LKEY);
		}

		for (i = 0; i < qp->sq.max; ++i) {
			next = get_send_wqe(qp, i);
			next->nda_op = htonl((((i + 1) & (qp->sq.max - 1)) <<
					      qp->sq.wqe_shift) +
					     qp->send_wqe_offset);
		}
	} else {
		for (i = 0; i < qp->rq.max; ++i) {
			next = get_recv_wqe(qp, i);
			next->nda_op = htonl((((i + 1) % qp->rq.max) <<
					     qp->rq.wqe_shift) | 1);
		}
	}

	qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
	qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);

	return 0;
}

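/*
 * The context keeps a two-level table mapping QP numbers to mthca_qp
 * structures so that the CQ poll path can find the QP for a
 * completion; the helpers below look up, store and clear entries.
 */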
struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mthca_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}

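/*
 * Used when a WQE completes in error: report whether the WQE had the
 * doorbell (DBD) bit set and compute the address/size word of the next
 * WQE in the chain, so the CQ error handling can adjust its state.
 */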
int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
		       int index, int *dbd, uint32_t *new_wqe)
{
	struct mthca_next_seg *next;

	/*
	 * For SRQs, all receive WQEs generate a CQE, so we're always
	 * at the end of the doorbell chain.
	 */
	if (qp->ibv_qp.srq && !is_send) {
		*new_wqe = 0;
		return 0;
	}

	if (is_send)
		next = get_send_wqe(qp, index);
	else
		next = get_recv_wqe(qp, index);

	*dbd = !!(next->ee_nds & htonl(MTHCA_NEXT_DBD));
	if (next->ee_nds & htonl(0x3f))
		*new_wqe = (next->nda_op & htonl(~0x3f)) |
			(next->ee_nds & htonl(0x3f));
	else
		*new_wqe = 0;

	return 0;
}