/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
	[IBV_WR_LOCAL_INV]		= MLX4_OPCODE_LOCAL_INVAL,
	[IBV_WR_BIND_MW]		= MLX4_OPCODE_BIND_MW,
	[IBV_WR_SEND_WITH_INV]		= MLX4_OPCODE_SEND_INVAL,
};

static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}
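
/*
 * Illustration (editor's note, not part of the driver logic):
 * fence_size counts the descriptor size in 16-byte units, so ds above
 * is the WQE size in 32-bit words and the loop steps through the WQE
 * in 64-byte chunks.  For example, a 256-byte WQE (fence_size == 16,
 * ds == 64) gets 0xffffffff written at wqe[16], wqe[32] and wqe[48],
 * i.e. at byte offsets 64, 128 and 192, leaving the first chunk (the
 * control segment) untouched.
 */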

void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head	 = 0;
	qp->sq.tail	 = 0;
	qp->rq.head	 = 0;
	qp->rq.tail	 = 0;
}

void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htobe32(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}
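
/*
 * Example of the overflow check above (editor's illustration): head
 * and tail are free-running unsigned counters, so head - tail is the
 * number of outstanding WQEs even across wraparound.  With
 * max_post == 4, head == 7 and tail == 5 there are 2 WQEs outstanding,
 * so posting nreq == 2 more takes the slow path; the count is then
 * re-read under the CQ lock because a concurrent poller of the
 * completion queue is what advances wq->tail.
 */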

static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
{
	int acc = wr->bind_mw.bind_info.mw_access_flags;
	bseg->flags1 = 0;
	if (acc & IBV_ACCESS_REMOTE_ATOMIC)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
	if (acc & IBV_ACCESS_REMOTE_WRITE)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
	if (acc & IBV_ACCESS_REMOTE_READ)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);

	bseg->flags2 = 0;
	if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
	if (acc & IBV_ACCESS_ZERO_BASED)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);

	bseg->new_rkey = htobe32(wr->bind_mw.rkey);
	bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
	bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
	bseg->length = htobe64(wr->bind_mw.bind_info.length);
}

static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
		uint32_t rkey)
{
	iseg->mem_key	= htobe32(rkey);

	iseg->reserved1    = 0;
	iseg->reserved2    = 0;
	iseg->reserved3[0] = 0;
	iseg->reserved3[1] = 0;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htobe64(remote_addr);
	rseg->rkey     = htobe32(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htobe64(wr->wr.atomic.swap);
		aseg->compare  = htobe64(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
	dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
	dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htobe32(sg->length);
	dseg->lkey       = htobe32(sg->lkey);
	dseg->addr       = htobe64(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey       = htobe32(sg->lkey);
	dseg->addr       = htobe64(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	udma_to_device_barrier();

	if (likely(sg->length))
		dseg->byte_count = htobe32(sg->length);
	else
		dseg->byte_count = htobe32(0x80000000);
}

int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl = NULL;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size = 0;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = EINVAL;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->srcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htobe32(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC_SEND:
			ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
			/* fall through */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				if (!wr->num_sge)
					inl = 1;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;
			case IBV_WR_LOCAL_INV:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_local_inv_seg(wqe, wr->imm_data);
				wqe  += sizeof
					(struct mlx4_wqe_local_inval_seg);
				size += sizeof
					(struct mlx4_wqe_local_inval_seg) / 16;
				break;
			case IBV_WR_BIND_MW:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_bind_seg(wqe, wr);
				wqe  += sizeof
					(struct mlx4_wqe_bind_seg);
				size += sizeof
					(struct mlx4_wqe_bind_seg) / 16;
				break;
			case IBV_WR_SEND_WITH_INV:
				ctrl->imm = htobe32(wr->imm_data);
				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;

			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							   MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		case IBV_QPT_RAW_PACKET:
			/* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
			 * to indicate that no icrc should be calculated */
			ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							   MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = ENOMEM;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					udma_to_device_barrier(); /* see comment below */
					seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				udma_to_device_barrier();
				seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof * seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

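			/*
			 * Write data segments in reverse order, so
			 * that the byte_count of the lowest segment,
			 * which overwrites the stamp dword of its
			 * cacheline, is written last (see the barrier
			 * in set_data_seg()); this keeps the HCA from
			 * prefetching a half-written descriptor.
			 */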
			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		udma_to_device_barrier();

		ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);

		ctrl->bf_qpn |= qp->doorbell_qpn;
		++qp->sq.head;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		mmio_wc_spinlock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		/* Flush before toggling bf_offset to be latency oriented */
		mmio_flush_writes();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
			    qp->doorbell_qpn);
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}
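
/*
 * Illustrative usage (editor's sketch, not part of this file's build):
 * applications reach mlx4_post_send() through the generic
 * ibv_post_send() verb.  A small inline send such as the one below,
 * which assumes qp, mr and buf were set up elsewhere, may take the
 * BlueFlame fast path above when it is the only WQE posted and its
 * descriptor fits in the BlueFlame buffer:
 *
 *	struct ibv_sge sge = {
 *		.addr   = (uintptr_t) buf,
 *		.length = 64,
 *		.lkey   = mr->lkey,
 *	};
 *	struct ibv_send_wr wr = {
 *		.wr_id      = 1,
 *		.sg_list    = &sge,
 *		.num_sge    = 1,
 *		.opcode     = IBV_WR_SEND,
 *		.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE,
 *	};
 *	struct ibv_send_wr *bad_wr;
 *	int err = ibv_post_send(qp, &wr, &bad_wr);
 */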

int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htobe32(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		*qp->db = htobe32(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}

static int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}
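
/*
 * Worked example (editor's illustration, assuming MLX4_INLINE_ALIGN
 * is 64 and the inline segment header is 4 bytes): for an RC QP the
 * control and remote address segments consume 32 bytes of the first
 * 64-byte chunk, so 128 bytes of inline data are treated as 160 bytes
 * of worst-case payload and (160 + 64 - 4 - 1) / (64 - 4) = 3 inline
 * segment headers are reserved.
 */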

void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge	 = align(cap->max_inline_data +
				 num_inline_segs(cap->max_inline_data, type) *
				 sizeof (struct mlx4_wqe_inline_seg),
				 sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}
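
/*
 * Worked example (editor's illustration, assuming the usual 16-byte
 * control, remote-address and data segments): an RC QP created with
 * max_send_sge == 4 and no inline data needs 4 * 16 bytes of scatter
 * entries plus a remote address segment and the control segment, i.e.
 * 96 bytes, so the loop above settles on wqe_shift == 7: a 128-byte
 * WQE stride, the smallest power of two that is at least 64 and no
 * smaller than 96.
 */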

int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs	 = cap->max_recv_sge;

	if (qp->sq.wqe_cnt) {
		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
		if (!qp->sq.wrid)
			return -1;
	}

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (qp->buf_size) {
		if (mlx4_alloc_buf(&qp->buf,
				   align(qp->buf_size, to_mdev(context->device)->page_size),
				   to_mdev(context->device)->page_size)) {
			free(qp->sq.wrid);
			free(qp->rq.wrid);
			return -1;
		}

		memset(qp->buf.buf, 0, qp->buf_size);
	} else {
		qp->buf.buf = NULL;
	}

	return 0;
}

void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;

	wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_UC:
	case IBV_QPT_RC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs	     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge    = qp->sq.max_gs;
	qp->sq.max_post	     = qp->sq.wqe_cnt - qp->sq_spare_wqes;
	cap->max_send_wr     = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data  = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}
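
/*
 * Worked example (editor's illustration, assuming 16-byte control and
 * remote-address segments and a 4-byte inline segment header): for an
 * RC QP with wqe_shift == 7, wqe_size is 128 - 16 - 16 = 96 bytes,
 * giving sq.max_gs == 6 scatter/gather entries and max_inline_data ==
 * 96 - 4 * (align(96, 64) / 64) = 96 - 8 = 88 bytes.
 */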

struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}
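
/*
 * Illustration of the two-level QP table used above and below
 * (editor's note, with hypothetical parameters): if num_qps == 65536,
 * qp_table_shift == 8 and qp_table_mask == 0xff, then QPN 0x1234 lands
 * in top-level slot (0x1234 & 0xffff) >> 8 == 0x12 and second-level
 * slot 0x1234 & 0xff == 0x34.  Each second-level table is allocated
 * lazily by the first mlx4_store_qp() into its slot and freed by
 * mlx4_clear_qp() when its refcnt drops to zero.
 */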

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}