/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
};

static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}

void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head	 = 0;
	qp->sq.tail	 = 0;
	qp->rq.head	 = 0;
	qp->rq.tail	 = 0;
}

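/*
 * Prepare a freshly created send queue: mark every WQE with the
 * hardware ownership bit, set fence_size to the full WQE size in
 * 16-byte units (1 << (wqe_shift - 4)), and stamp it, so a prefetch of
 * an entry that was never posted is not mistaken for valid work.
 * mlx4_post_send() alternates the ownership bit on successive passes
 * around the ring.
 */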
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htonl(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

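/*
 * Check whether posting nreq more WRs would overflow the work queue.
 * The head/tail comparison is first done without the CQ lock; only if
 * the queue looks full is it rechecked under the CQ lock, since the
 * tail may have been advanced by completion processing in the
 * meantime.
 */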
static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htonll(remote_addr);
	rseg->rkey     = htonl(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htonll(wr->wr.atomic.swap);
		aseg->compare  = htonll(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htonll(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htonl(wr->wr.ud.remote_qpn);
	dseg->qkey = htonl(wr->wr.ud.remote_qkey);
	dseg->vlan = htons(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

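/*
 * Write a scatter/gather entry.  __set_data_seg() is the plain form
 * used for receive WQEs; set_data_seg() below is used on the send path
 * and orders the byte_count store after the rest of the entry with a
 * write barrier (see the comment inside it).
 */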
static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htonl(sg->length);
	dseg->lkey       = htonl(sg->lkey);
	dseg->addr       = htonll(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey       = htonl(sg->lkey);
	dseg->addr       = htonll(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	dseg->byte_count = htonl(sg->length);
}

/*
 * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
 * implementations may use move-string-buffer assembler instructions,
 * which do not guarantee order of copying.
 */
static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
{
	while (bytecnt > 0) {
		*dst++ = *src++;
		*dst++ = *src++;
		bytecnt -= 2 * sizeof (long);
	}
}

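/*
 * Post a chain of send work requests.  Each WR is built in place as a
 * WQE: a control segment, then any remote-address/atomic/datagram
 * segments required by the opcode, then either inline data or a list
 * of scatter/gather entries.  The ownership bit is written only after
 * the rest of the descriptor (see the barrier below), and previously
 * posted WQEs are stamped so that a hardware prefetch of a
 * not-yet-posted entry is recognized as invalid.
 */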
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC:
			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
			/* fall thru */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			if (to_mah(wr->wr.ud.ah)->tagged) {
				ctrl->ins_vlan = 1 << 6;
				ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
			}

			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = -1;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

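	/*
	 * If exactly one WR was posted, it carried inline data, and the
	 * descriptor is small enough, write the whole WQE to the
	 * BlueFlame page instead of ringing the send doorbell; the two
	 * halves of the BlueFlame buffer are used alternately
	 * (bf_offset is toggled under bf_lock).  Otherwise fall back to
	 * the regular doorbell write.
	 */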
	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		*(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		wc_wmb();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}

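/*
 * Post a chain of receive work requests.  Each receive WQE is simply a
 * list of scatter entries; if fewer than rq.max_gs entries are used,
 * the list is terminated with a zero-length entry carrying the invalid
 * lkey.  The receive doorbell record is updated once, after a write
 * barrier, for the whole chain.
 */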
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htonl(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*qp->db = htonl(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}

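/*
 * Worked example (assuming MLX4_INLINE_ALIGN is 64 and the inline
 * segment header is 4 bytes): for a non-UD QP, the 16-byte control
 * segment plus a 16-byte remote address segment leave 32 bytes in the
 * first 64-byte chunk, so 28 bytes of inline data fit under a single
 * segment header while 29 bytes already need a second segment.
 * num_inline_segs() below computes this worst-case header count.
 */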
int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}

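/*
 * Compute sq.wqe_shift: the send WQE stride is the smallest power of
 * two (at least 64 bytes) that can hold the control segment, the
 * per-transport segments, and the worst case of the requested
 * scatter/gather list, the requested inline data, an atomic operation,
 * or a memory-window bind request.
 */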
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge	 = align(cap->max_inline_data +
				 num_inline_segs(cap->max_inline_data, type) *
				 sizeof (struct mlx4_wqe_inline_seg),
				 sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

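/*
 * Allocate the wrid arrays and the page-aligned QP buffer that holds
 * both work queues.  The receive WQE stride is the smallest power of
 * two (at least 16 bytes) that holds rq.max_gs scatter entries, and
 * the queue with the larger stride (the SQ on ties) is placed first in
 * the buffer.
 */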
int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs	 = cap->max_recv_sge;

	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
	if (!qp->sq.wrid)
		return -1;

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (mlx4_alloc_buf(&qp->buf,
			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
			    to_mdev(pd->context->device)->page_size)) {
		free(qp->sq.wrid);
		free(qp->rq.wrid);
		return -1;
	}

	memset(qp->buf.buf, 0, qp->buf_size);

	return 0;
}

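/*
 * Derive the send-queue capabilities reported back to the caller
 * (max_send_sge, max_send_wr, max_inline_data) from the WQE stride
 * chosen above, capped by the device limits stored in the context.
 */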
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;
	struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);

	wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) -
		sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
	case IBV_QPT_RC:
	case IBV_QPT_XRC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs	     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge    = min(ctx->max_sge, qp->sq.max_gs);
	qp->sq.max_post	     = min(ctx->max_qp_wr,
				   qp->sq.wqe_cnt - qp->sq_spare_wqes);
	cap->max_send_wr     = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data  = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}

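/*
 * Userspace QP lookup table, used to map a QP number to its mlx4_qp.
 * The low bits of the QPN are split in two: the upper part selects a
 * lazily allocated chunk of pointers, the lower part (qp_table_mask)
 * indexes within the chunk.  refcnt counts live QPs per chunk so the
 * chunk can be freed when the last one is cleared.
 */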
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}