/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file may contain confidential information of
 * Mellanox Technologies, Ltd. and should not be distributed in source
 * form without approval from Sun Legal.
 */

#include "dapl.h"
#include "dapl_tavor_hw.h"
#include "dapl_tavor_wr.h"
#include "dapl_tavor_ibtf_impl.h"

/*
 * Function signatures
 */
extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
    uint_t, uint_t, dapls_tavor_wrid_entry_t *);
extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
extern DAPL_OS_LOCK g_tavor_uar_lock;

#ifndef	_LP64
extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
#endif

static int dapli_tavor_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_send_linknext(ibt_send_wr_t *, uint64_t *,
    boolean_t, uint32_t, uint_t, uint64_t *, tavor_sw_wqe_dbinfo_t *);
static DAT_RETURN dapli_tavor_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_recv_linknext(uint64_t *, boolean_t, uint32_t,
    uint_t, uint64_t *);
static int dapli_tavor_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);
static int dapli_tavor_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);

/* exported to other HCAs */
extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
    uint32_t, uint_t);
extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);
/*
 * Note: The 64-bit doorbells need to be written atomically.
 * In 32-bit libraries we need to use a special assembly routine
 * because compiler-generated code splits the write into two word writes.
 */
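
/*
 * A minimal sketch of the failure mode (illustration only, not part of
 * the build): on a 32-bit target the compiler typically lowers a 64-bit
 * store into two 32-bit stores, roughly
 *
 *	volatile uint32_t *p = (volatile uint32_t *)uar;
 *	p[0] = (uint32_t)(doorbell >> 32);	(upper word)
 *	p[1] = (uint32_t)doorbell;		(lower word)
 *
 * If the HCA samples the UAR between the two stores it sees a torn
 * doorbell value; hence the g_tavor_uar_lock around the paired stores
 * on i386 and the atomic assembly routine on other 32-bit targets.
 */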

#if defined(_LP64) || defined(__lint)
/* use a macro to ensure inlining on S10 amd64 compiler */
#define	dapli_tavor_cq_doorbell(ia_uar, cq_cmd, cqn, cq_param) \
	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64( \
	    ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) | \
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param)
#else

/*
 * dapli_tavor_cq_doorbell()
 * Takes the specified cq cmd and cq number and rings the cq doorbell
 */
static void
dapli_tavor_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
	/* 32 bit version */
#elif defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->cq[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->cq[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->cq);
#endif
}
#pragma inline(dapli_tavor_cq_doorbell)

#endif	/* _LP64 */

#if defined(_LP64) || defined(__lint)
#define	dapli_tavor_qp_send_doorbell(ia_uar, nda, nds, qpn, fence, nopcode) \
	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64( \
	    (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) << \
	    TAVOR_QPSNDDB_NDA_SHIFT) | \
	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) | \
	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) | \
	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds)
#else

/*
 * dapli_tavor_qp_send_doorbell()
 * Takes the specified next descriptor information, qp number, opcode and
 * rings the send doorbell
 */
static void
dapli_tavor_qp_send_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
	    TAVOR_QPSNDDB_NDA_SHIFT) |
	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
#else
#if defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->send[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->send[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->send);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_send_doorbell)
#endif	/* _LP64 */

#if defined(_LP64) || defined(__lint)

#define	dapli_tavor_qp_recv_doorbell(ia_uar, nda, nds, qpn, credits) \
	((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64( \
	    (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) << \
	    TAVOR_QPRCVDB_NDA_SHIFT) | \
	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) | \
	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits)
#else

/*
 * dapli_tavor_qp_recv_doorbell()
 * Takes the specified next descriptor information, qp number and
 * rings the recv doorbell
 */
static void
dapli_tavor_qp_recv_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
	    TAVOR_QPRCVDB_NDA_SHIFT) |
	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(doorbell);
#else
#if defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->recv[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->recv[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->recv);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_recv_doorbell)
#endif	/* _LP64 */


/*
 * dapls_tavor_max_inline()
 * Return the max inline value that should be used.
 * The env variable DAPL_MAX_INLINE can override the default.
 * If it is not set (or set to -1), the default behavior is used.
 * If it is zero or negative (other than -1), inlining is not done.
 */
int
dapls_tavor_max_inline(void)
{
	static int max_inline_env = -2;

	/* Check the env exactly once, otherwise return the previous value. */
	if (max_inline_env != -2)
		return (max_inline_env);

	max_inline_env = dapl_os_get_env_val("DAPL_MAX_INLINE", -1);
	if (max_inline_env != -1)
		if (max_inline_env <= 0)
			max_inline_env = 0;	/* no inlining */
	return (max_inline_env);
}
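
/*
 * Example: DAPL_MAX_INLINE=64 in the environment caps inline data at
 * 64 bytes, DAPL_MAX_INLINE=0 disables inlining entirely, and leaving
 * the variable unset (or setting it to -1) defers to the default
 * heuristic used by dapls_ib_max_request_iov() below.
 */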

/*
 * dapls_ib_max_request_iov(), aka, max send sgl size.
 * The send queue's scatter/gather list is used for "inline" data.
 *
 * By default, compute reasonable send queue size based on #iovs, #wqes,
 * max_iovs, and max inline byte count.  If the #wqes is large, then we
 * limit how much the SGL (space for inline data) can take.  The heuristic
 * is to increase the memory for the send queue to a maximum of 32KB:
 *
 *	< 128 wqes	increase to at most 256 minus header
 *	< 256 wqes	increase to at most 128 minus header
 *	>= 256 wqes	use SGL unaltered
 *
 * If the env is supplied (max_inline >= 0), use it without checking.
 */
int
dapls_ib_max_request_iov(int iovs, int wqes, int max_iovs,
    int max_inline_bytes)
{
	int ret_iovs;

	if (max_inline_bytes > 0) {
		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
	} else if (wqes < 128) {
		max_inline_bytes = 256 - TAVOR_INLINE_HEADER_SIZE_MAX;
		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
	} else if (wqes < 256) {
		max_inline_bytes = 128 - TAVOR_INLINE_HEADER_SIZE_MAX;
		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
	} else {
		ret_iovs = iovs;
	}

	if (ret_iovs > max_iovs)	/* do not exceed max */
		ret_iovs = max_iovs;
	if (iovs > ret_iovs)		/* never decrease iovs */
		ret_iovs = iovs;
	return (ret_iovs);
}
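
/*
 * Worked example (assuming 16-byte SGL entries, i.e.
 * sizeof (tavor_hw_wqe_sgl_t) == 16, and no env override): a QP with
 * 100 send WQEs falls into the "< 128 wqes" case, allowing
 * 256 - TAVOR_INLINE_HEADER_SIZE_MAX inline bytes; with a 16-byte
 * header that is 240 / 16 = 15 SGL entries, clamped to max_iovs and
 * never below the caller's iovs.
 */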

/*
 * dapli_tavor_wqe_send_build()
 * Constructs a WQE for a given ibt_send_wr_t
 */
static int
dapli_tavor_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_snd_wqe_remaddr_t	*rc;
	tavor_hw_snd_wqe_bind_t		*bn;
	tavor_hw_wqe_sgl_t		*ds;
	ibt_wr_ds_t			*sgl;
	uint32_t			nds;
	uint32_t			len, total_len;
	uint32_t			tavor_num_mpt_mask;
	uint32_t			new_rkey;
	uint32_t			old_rkey;
	int				i, num_ds;
	int				max_inline_bytes = -1;

	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;

	/*
	 * RC is the only supported transport in UDAPL
	 * For RC requests, we allow "Send", "RDMA Read", "RDMA Write"
	 */
	switch (wr->wr_opcode) {
	case IBT_WRC_SEND:
		/*
		 * If this is a Send request, then all we need is
		 * the Data Segment processing below.
		 * Initialize the information for the Data Segments
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
		break;
	case IBT_WRC_RDMAW:
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
		/* FALLTHROUGH */
	case IBT_WRC_RDMAR:
		if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
			qp->qp_sq_inline = 0;
		/*
		 * If this is an RDMA Read or RDMA Write request, then fill
		 * in the "Remote Address" header fields.
		 */
		rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Remote Address Segment for the WQE, using
		 * the information from the RC work request.
		 */
		TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);

		/* Update "ds" for filling in Data Segments (below) */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
		    sizeof (tavor_hw_snd_wqe_remaddr_t));
		break;
	case IBT_WRC_BIND:
		/*
		 * Generate a new R_key.
		 * Increment the upper "unconstrained" bits while keeping
		 * the lower "constrained" bits the same, since they
		 * represent the MPT index.
		 */
		old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
		tavor_num_mpt_mask = (uint32_t)(1 << qp->qp_num_mpt_shift) - 1;
		new_rkey = (old_rkey >> qp->qp_num_mpt_shift);
		new_rkey++;
		new_rkey = ((new_rkey << qp->qp_num_mpt_shift) |
		    (old_rkey & tavor_num_mpt_mask));

		wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;

		bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Bind Memory Window Segments for the WQE,
		 * using the information from the RC Bind memory
		 * window work request.
		 */
		TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);

		/*
		 * Update the "ds" pointer.  Even though the "bind"
		 * operation requires no SGLs, this is necessary to
		 * facilitate the correct descriptor size calculations
		 * (below).
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
		    sizeof (tavor_hw_snd_wqe_bind_t));
		break;
	default:
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "dapli_tavor_wqe_send_build: invalid wr_opcode=%d\n",
		    wr->wr_opcode);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based on
	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
	 * Start by checking for a valid number of SGL entries.
	 */
	if (nds > qp->qp_sq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Send Work Request, fill in the Send WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
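	/*
	 * Example: a single 2GB (0x80000000 byte) request masked with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK yields a zero "byte_cnt", which the
	 * hardware interprets as the full 2GB transfer; a caller-supplied
	 * zero-length SGL, by contrast, is simply skipped by the loops
	 * below.
	 */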

	if (max_inline_bytes != -1) {		/* compute total_len */
		total_len = 0;
		for (i = 0; i < nds; i++)
			total_len += sgl[i].ds_len;
		if (total_len > max_inline_bytes)
			max_inline_bytes = -1;	/* too big, do not "inline" */
	}
	if (max_inline_bytes != -1) {		/* do "inline" */
		uint8_t *dst = (uint8_t *)((uint32_t *)ds + 1);
		*(uint32_t *)ds =
		    HTOBE_32(total_len | TAVOR_WQE_SGL_INLINE_MASK);
		for (i = 0; i < nds; i++) {
			if ((len = sgl[i].ds_len) == 0) {
				continue;
			}
			(void) dapl_os_memcpy(dst,
			    (void *)(uintptr_t)sgl[i].ds_va, len);
			dst += len;
		}
		/* Return the size of descriptor (in 16-byte chunks) */
		*size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
	} else {
		for (i = 0; i < nds; i++) {
			if (sgl[i].ds_len == 0) {
				continue;
			}

			/*
			 * Fill in the Data Segment(s) for the current WQE,
			 * using the information contained in the
			 * scatter-gather list of the work request.
			 */
			TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
			num_ds++;
		}

		/* Return the size of descriptor (in 16-byte chunks) */
		*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_send_linknext()
 * Takes a WQE and links it to the prev WQE chain
 */
static void
dapli_tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, uint64_t *curr_addr,
    boolean_t ns, uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
    tavor_sw_wqe_dbinfo_t *dbinfo)
{
	uint64_t	next, ctrl;
	uint32_t	nopcode, fence;

	next = 0;
	ctrl = 0;

	/* Set the "c" (i.e. "signaled") bit appropriately */
	if (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
		ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
	}

	/* Set the "s" (i.e. "solicited") bit appropriately */
	if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
		ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
	}
	/* Set the "e" (i.e. "event") bit if notification is needed */
	if (!ns) {
		ctrl = ctrl | TAVOR_WQE_RCV_EVENT_MASK;
	}

	/*
	 * The "i" bit is unused since uDAPL doesn't support
	 * the immediate data
	 */

	/* initialize the ctrl and next fields of the current descriptor */
	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

	/*
	 * Calculate the "next" field of the prev descriptor.  This amounts
	 * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
	 * fields (see tavor_hw.h for more).
	 */

	/*
	 * Determine the value for the Tavor WQE "nopcode" field
	 * by using the IBTF opcode from the work request
	 */
	switch (curr_wr->wr_opcode) {
	case IBT_WRC_RDMAW:
		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
		break;

	case IBT_WRC_SEND:
		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
		break;

	case IBT_WRC_RDMAR:
		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
		break;

	case IBT_WRC_BIND:
		nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
		break;
	default:
		/* Unsupported opcodes in UDAPL */
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "dapli_tavor_wqe_send_linknext: invalid wr_opcode=%d\n",
		    curr_wr->wr_opcode);
		return;
	}

	next  = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
	next  = next | ((uint64_t)nopcode << 32);
	fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
	if (fence) {
		next = next | TAVOR_WQE_SEND_FENCE_MASK;
	}
	next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);

	/*
	 * A send queue doorbell will be rung for the next
	 * WQE on the chain, so set the current WQE's "dbd" bit.
	 * Note: We also update the "dbinfo" structure here to pass
	 * back information about what should (later) be included
	 * in the send queue doorbell.
	 */
	next = next | TAVOR_WQE_DBD_MASK;
	dbinfo->db_nopcode = nopcode;
	dbinfo->db_fence   = fence;

	/*
	 * A send queue doorbell will be rung for the next WQE on
	 * the chain, so update the prev WQE's "next" field and return.
	 */
	if (prev_addr != NULL) {
		TAVOR_WQE_LINKFIRST(prev_addr, next);
	}
}


/*
 * dapli_tavor_wqe_recv_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_wqe_sgl_t	*ds;
	int			i;
	int			num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > qp->qp_rq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/* Return the size of descriptor (in 16-byte chunks) */
	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;

	return (DAT_SUCCESS);
}


/*
 * dapli_tavor_wqe_recv_linknext()
 * Links a recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_recv_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr)
{
	uint64_t	next;
	uint64_t	ctrl = 0;

	/*
	 * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
	 * at a time).  Since there is no next descriptor (i.e. the current
	 * descriptor is the last WQE on the chain), we set the "next" field
	 * to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
	 * requires the "dbd" bit to be set to one for all Recv WQEs.
	 * In any case, we must add a single bit in the "reserved" field
	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
	 * workaround for a known Tavor errata that can cause Recv WQEs with
	 * zero in the NDA field to behave improperly.
	 *
	 * If notification suppression is not desired then we set
	 * the "E" bit in the ctrl field.
	 */

	next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
	if (!ns) { /* notification needed - so set the "E" bit */
		ctrl = TAVOR_WQE_RCV_EVENT_MASK;
	}

	/* update the WQE */
	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

	if (prev_addr != NULL) {
		/*
		 * Calculate the "next" field of the descriptor.  This amounts
		 * to setting up the "next_wqe_addr", "dbd", and "nds" fields
		 * (see tavor_hw.h for more).
		 */
		next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

		/*
		 * Since this WQE is being linked to the previous
		 * descriptor, we need only update the previous WQE's
		 * "next" field and must not touch this WQE's "ctrl"
		 * fields.
		 */
		TAVOR_WQE_LINKFIRST(prev_addr, next);
	}
}

/*
 * dapli_tavor_wqe_srq_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
{
	tavor_hw_wqe_sgl_t	*ds;
	ibt_wr_ds_t		end_sgl;
	int			i;
	int			num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > srq->srq_wq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/*
	 * For SRQ, if the number of data segments is less than the maximum
	 * specified at alloc, then we have to fill in a special "key" entry in
	 * the sgl entry after the last valid one in this post request.  We do
	 * that here.
	 */
	if (num_ds < srq->srq_wq_sgl) {
		end_sgl.ds_va  = (ib_vaddr_t)0;
		end_sgl.ds_len = (ib_msglen_t)0;
		end_sgl.ds_key = (ibt_lkey_t)1;
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_srq_linknext()
 * Links a srq recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_srq_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint64_t *prev_addr)
{
	uint64_t	next;
	uint64_t	ctrl = 0;

	/*
	 * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
	 * at a time).  In any case, we must add a single bit in the
	 * "reserved" field (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.
	 * This is the workaround for a known Tavor errata that can cause
	 * Recv WQEs with zero in the NDA field to behave improperly.
	 *
	 * If notification suppression is not desired then we set
	 * the "E" bit in the ctrl field.
	 */

	next = TAVOR_RCV_WQE_NDA0_WA_MASK;
	if (!ns) { /* notification needed - so set the "E" bit */
		ctrl = TAVOR_WQE_RCV_EVENT_MASK;
	}

	/* update the WQE */
	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

	if (prev_addr != NULL) {
		/*
		 * Calculate the "next" field of the descriptor.  This amounts
		 * to setting up the "next_wqe_addr", "dbd", and "nds" fields
		 * (see tavor_hw.h for more).
		 */
		next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

		/*
		 * Since this WQE is being linked to the previous
		 * descriptor, we need only update the previous WQE's
		 * "next" field and must not touch this WQE's "ctrl"
		 * fields.
		 */
		TAVOR_WQE_LINKFIRST(prev_addr, next);
	}
}

/*
 * dapli_tavor_cq_peek()
 * Peeks into a given CQ to check if there are any events that can be
 * polled.  It returns the number of CQEs that can be polled.
 */
static void
dapli_tavor_cq_peek(ib_cq_handle_t cq, int *num_cqe)
{
	tavor_hw_cqe_t		*cqe;
	uint32_t		imm_eth_pkey_cred;
	uint32_t		cons_indx;
	uint32_t		wrap_around_mask;
	uint32_t		polled_cnt;
	uint_t			doorbell_cnt;
	uint_t			opcode;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Count entries in the CQ until we find an entry owned by
	 * the hardware.
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		opcode = TAVOR_CQE_OPCODE_GET(cqe);
		/* Error CQEs map to multiple work completions */
		if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
		    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
			imm_eth_pkey_cred =
			    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
			doorbell_cnt =
			    imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
			polled_cnt += (doorbell_cnt + 1);
		} else {
			polled_cnt++;
		}
		/* Increment the consumer index */
		cons_indx = (cons_indx + 1) & wrap_around_mask;

		/* Update the pointer to the next CQ entry */
		cqe = &cq->cq_addr[cons_indx];
	}

	*num_cqe = polled_cnt;
}
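
/*
 * Example: an error CQE whose doorbell count is 3 represents 3 + 1 = 4
 * flushed work requests, so dapli_tavor_cq_peek() counts it as four
 * pollable completions, whereas a successful CQE always counts as one.
 */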

/*
 * dapli_tavor_cq_poll()
 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
 * array that is passed in.
 */
static DAT_RETURN
dapli_tavor_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
{
	tavor_hw_cqe_t		*cqe;
	uint32_t		cons_indx;
	uint32_t		wrap_around_mask;
	uint32_t		polled_cnt;
	uint32_t		num_to_increment;
	DAT_RETURN		dat_status;
	int			status;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling dapli_tavor_cq_cqe_consume() and updating the
	 * CQ consumer index.  Note:  We only update the consumer index if
	 * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE and corresponds to more than
	 * one completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		status = dapli_tavor_cq_cqe_consume(cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_addr[cons_indx];
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	dat_status = DAT_SUCCESS;
	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_size) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;
		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_num, num_to_increment);
	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.
		 */
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}

	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	return (dat_status);
}
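
/*
 * Doorbell arithmetic example: with cq_size == 256, cq_consindx == 250
 * and a new cons_indx of 2, eight CQEs were consumed, so the doorbell
 * value is ((2 + 256) - 250) - 1 == 7, since the doorbell always
 * encodes the number of entries consumed minus one.
 */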

/*
 * dapli_tavor_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
 * that is passed in.  See above for more comments/details.
 */
static DAT_RETURN
dapli_tavor_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
{
	tavor_hw_cqe_t		*cqe;
	uint32_t		cons_indx;
	DAT_RETURN		dat_status;
	int			status;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Pull the entry from the CQ if it is owned by software.  Process
	 * it by calling dapli_tavor_cq_cqe_consume() and updating the
	 * CQ consumer index.  Note:  We only update the consumer index if
	 * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE and corresponds to more than
	 * one completion).
	 */
	if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		status = dapli_tavor_cq_cqe_consume(cq, cqe, wc_p);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);

			/* Increment the consumer index */
			cq->cq_consindx =
			    (cons_indx + 1) & (cq->cq_size - 1);
			dapli_tavor_cq_doorbell(cq->cq_iauar,
			    TAVOR_CQDB_INCR_CONSINDX,
			    cq->cq_num, 0);
		}
		dat_status = DAT_SUCCESS;
	} else {
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}
	return (dat_status);
}

/*
 * dapli_tavor_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
 */
static int
dapli_tavor_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
	uint_t		flags;
	uint_t		type;
	uint_t		opcode;
	int		status;

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call dapli_tavor_cq_errcqe_consume() and
	 * return whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = dapli_tavor_cq_errcqe_consume(cqhdl, cqe, wc);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
	    TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {

		/*
		 * Send CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 * case TAVOR_CQE_SND_RDMAWR_IMM:
		 * case TAVOR_CQE_SND_SEND_IMM:
		 * case TAVOR_CQE_SND_ATOMIC_CS:
		 * case TAVOR_CQE_SND_ATOMIC_FA:
		 */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/*
		 * Receive CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 *
		 * case TAVOR_CQE_RCV_RECV_IMM:
		 * case TAVOR_CQE_RCV_RECV_IMM2:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
		 */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;
		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;
	wc->wc_flags = flags;
	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * dapli_tavor_cq_errcqe_consume()
 */
static int
dapli_tavor_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
	dapls_tavor_wrid_entry_t	wre;
	uint32_t		next_wqeaddr;
	uint32_t		imm_eth_pkey_cred;
	uint_t			nextwqesize, dbd;
	uint_t			doorbell_cnt, status;
	uint_t			opcode = TAVOR_CQE_OPCODE_GET(cqe);

	dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
	    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
	    TAVOR_CQE_WQEADDRSZ_GET(cqe));

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
	    (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
	    TAVOR_COMPLETION_RECV, 1, &wre);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;
	}
	wc->wc_status = status;
	wc->wc_type = 0;
	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling".
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get the doorbell info, WQE address, and size for the next
	 * WQE from the "wre" (which was filled in above in the call to the
	 * dapls_tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 *    Return status to indicate that doorbell and sync may be
		 *    necessary.
		 */
		return (TAVOR_CQ_SYNC_AND_DB);

	} else {
		/*
		 * Recycle the CQE for use in the next PollCQ() call
		 *    Decrement the doorbell count, modify the error status,
		 *    and update the WQE address and size (to point to the
		 *    next WQE on the chain).  Put these updated entries back
		 *    into the CQE.
		 *    Despite the fact that we have updated the CQE, it is not
		 *    necessary for us to attempt to sync this entry just yet
		 *    as we have not changed the "hardware's view" of the
		 *    entry (i.e. we have not modified the "owner" bit - which
		 *    is all that the Tavor hardware really cares about).
		 */
		doorbell_cnt = doorbell_cnt - dbd;
		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cqe,
		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
		TAVOR_CQE_WQEADDRSZ_SET(cqe,
		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
		dapl_dbg_log(DAPL_DBG_TYPE_EVD,
		    "errcqe_consume: recycling cqe.eth=%x, wqe=%x\n",
		    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
		    TAVOR_CQE_WQEADDRSZ_GET(cqe));
		return (TAVOR_CQ_RECYCLE_ENTRY);
	}
}

/*
 * dapli_tavor_cq_notify()
 * This function is used for arming the CQ by ringing the CQ doorbell.
 */
static DAT_RETURN
dapli_tavor_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
{
	uint32_t	cqnum;

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 */
	cqnum = cq->cq_num;
	if (flags == IB_NOTIFY_ON_NEXT_COMP) {
		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_CQ,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
		dapli_tavor_cq_doorbell(cq->cq_iauar,
		    TAVOR_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IB_NOTIFY_ON_NEXT_NCOMP) {
		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_NCQ,
		    cqnum, param);
	} else {
		return (DAT_INVALID_PARAMETER);
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_send()
 */
static DAT_RETURN
dapli_tavor_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
{
	tavor_sw_wqe_dbinfo_t		dbinfo;
	dapls_tavor_wrid_list_hdr_t	*wridlist;
	dapls_tavor_wrid_entry_t	*wre_last;
	uint32_t			desc;
	uint64_t			*wqe_addr;
	uint32_t			desc_sz;
	uint32_t			wqeaddrsz, signaled_dbd = 0;
	uint32_t			head, tail, next_tail, qsize_msk;
	int				status;
	ib_qp_handle_t			qp;

	if ((ep->qp_state == IBT_STATE_RESET) ||
	    (ep->qp_state == IBT_STATE_INIT) ||
	    (ep->qp_state == IBT_STATE_RTR)) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_send: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}

	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
	tail	  = qp->qp_sq_wqhdr->wq_tail;
	head	  = qp->qp_sq_wqhdr->wq_head;

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * then no more WQEs can be posted; return an error
	 */
	if (qp->qp_sq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue full" condition.
	 * If we detect that the current work request is going to fill the
	 * work queue, then we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_sq_wqhdr->wq_full = 1;
	}

	/*
	 * Get the user virtual address of the location where the next
	 * Send WQE should be built
	 */
	wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);

	/*
	 * Call dapli_tavor_wqe_send_build() to build the WQE at the given
	 * address.  This routine uses the information in the ibt_send_wr_t
	 * and returns the size of the WQE when it returns.
	 */
	status = dapli_tavor_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (status);
	}

	/*
	 * Get the descriptor (io address) corresponding to the location
	 * where the Send WQE was built.
	 */
	desc = TAVOR_QP_SQ_DESC(qp, tail);

	dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
	    desc <= (qp->qp_sq_desc_addr +
	    qp->qp_sq_numwqe*qp->qp_sq_wqesz));

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddrsz" and "signaled_dbd" values to pass to
	 * dapli_tavor_wrid_add_entry()
	 */
	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);

	if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
		signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
	}

	dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
	    signaled_dbd);

	/*
	 * Now link the wqe to the old chain (if there was one)
	 */
	dapli_tavor_wqe_send_linknext(wr, wqe_addr, ns, desc, desc_sz,
	    qp->qp_sq_lastwqeaddr, &dbinfo);

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_sq_lastwqeaddr	 = wqe_addr;
	qp->qp_sq_wqhdr->wq_tail = next_tail;

	/* Ring the doorbell */
	dapli_tavor_qp_send_doorbell(qp->qp_iauar, desc, desc_sz,
	    qp->qp_num, dbinfo.db_fence, dbinfo.db_nopcode);

	dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_recv()
 */
static DAT_RETURN
dapli_tavor_post_recv(DAPL_EP	*ep, ibt_recv_wr_t *wr, boolean_t ns)
{
	dapls_tavor_wrid_list_hdr_t	*wridlist;
	dapls_tavor_wrid_entry_t	*wre_last;
	ib_qp_handle_t			qp;
	DAT_RETURN			status;
	uint32_t			desc;
	uint64_t			*wqe_addr;
	uint32_t			desc_sz;
	uint32_t			wqeaddrsz;
	uint32_t			head, tail, next_tail, qsize_msk;

	if (ep->qp_state == IBT_STATE_RESET) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_recv: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}
	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist  = qp->qp_rq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
	tail	  = qp->qp_rq_wqhdr->wq_tail;
	head	  = qp->qp_rq_wqhdr->wq_head;

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * doorbell.
	 */

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * then no more WQEs can be posted.  So return an error.
	 */
	if (qp->qp_rq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue
	 * full" condition.  If we detect that the current
	 * work request is going to fill the work queue, then
	 * we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_rq_wqhdr->wq_full = 1;
	}

	/* Get the descriptor (IO Address) of the WQE to be built */
	desc = TAVOR_QP_RQ_DESC(qp, tail);
	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);

	/*
	 * Call dapli_tavor_wqe_recv_build() to build the WQE at the given
	 * address.  This routine uses the information in the
	 * ibt_recv_wr_t and returns the size of the WQE.
	 */
	status = dapli_tavor_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddrsz" and "signaled_dbd" values to pass to
	 * dapli_tavor_wrid_add_entry().
	 * Note: all Recv WQEs are essentially "signaled"
	 */
	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);
	dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
	    (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);

	/*
	 * Now link the chain to the old chain (if there was one)
	 * and ring the doorbell for the recv work queue.
	 */
	dapli_tavor_wqe_recv_linknext(wqe_addr, ns, desc, desc_sz,
	    qp->qp_rq_lastwqeaddr);

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_rq_lastwqeaddr	 = wqe_addr;
	qp->qp_rq_wqhdr->wq_tail = next_tail;

	/* Ring the doorbell */
	dapli_tavor_qp_recv_doorbell(qp->qp_iauar, desc, desc_sz,
	    qp->qp_num, 1);

	dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_srq()
 */
static DAT_RETURN
dapli_tavor_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
{
	ib_srq_handle_t			srq;
	DAT_RETURN			status;
	uint32_t			desc;
	uint64_t			*wqe_addr;
	uint64_t			*last_wqe_addr;
	uint32_t			head, next_head, qsize_msk;
	uint32_t			wqe_index;


	srq = srqp->srq_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * doorbell.
	 */

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * i.e. there are no free entries, then no more WQEs can be posted.
	 * So return an error.
	 */
	if (srq->srq_wridlist->wl_freel_entries == 0) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/* Save away some initial SRQ state */
	qsize_msk = srq->srq_wridlist->wl_size - 1;
	head	  = srq->srq_wridlist->wl_freel_head;

	next_head = (head + 1) & qsize_msk;

	/* Get the descriptor (IO Address) of the WQE to be built */
	desc = srq->srq_wridlist->wl_free_list[head];

	wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
	    srq->srq_wq_wqesz);

	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);

	/*
	 * Call dapli_tavor_wqe_srq_build() to build the WQE at the given
	 * address.  This routine uses the information in the ibt_recv_wr_t
	 * to fill in the WQE.
	 */
	status = dapli_tavor_wqe_srq_build(srq, wr, wqe_addr);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (status);
	}

	/*
	 * Add a WRID entry to the WRID list.
	 */
	dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);

	if (srq->srq_wq_lastwqeindex == -1) {
		last_wqe_addr = NULL;
	} else {
		last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
		    srq->srq_wq_lastwqeindex);
	}
	/*
	 * Now link the chain to the old chain (if there was one)
	 * and ring the doorbell for the SRQ.
	 */
	dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);

	/* Update some of the state in the SRQ */
	srq->srq_wq_lastwqeindex	 = wqe_index;
	srq->srq_wridlist->wl_freel_head = next_head;
	srq->srq_wridlist->wl_freel_entries--;
	dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
	    srq->srq_wridlist->wl_size);

	/* Ring the doorbell - for SRQ nds = 0 */
	dapli_tavor_qp_recv_doorbell(srq->srq_iauar, desc, 0,
	    srq->srq_num, 1);

	dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wrid_add_entry()
 */
extern void
dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *wq, uint64_t wrid,
    uint32_t wqeaddrsz, uint_t signaled_dbd)
{
	dapls_tavor_wrid_entry_t	*wre_tmp;
	uint32_t			head, tail, size;

	/*
	 * Find the entry in the container pointed to by the "tail" index.
	 * Add all of the relevant information to that entry, including WRID,
	 * the "wqeaddrsz" parameter, and whether it was signaled/unsignaled
	 * and/or doorbelled.
	 */
	head = wq->wq_wrid_post->wl_head;
	tail = wq->wq_wrid_post->wl_tail;
	size = wq->wq_wrid_post->wl_size;
	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
	wre_tmp->wr_wrid	  = wrid;
	wre_tmp->wr_wqeaddrsz	  = wqeaddrsz;
	wre_tmp->wr_signaled_dbd  = signaled_dbd;

	/*
	 * Update the "wrid_old_tail" pointer to point to the entry we just
	 * inserted into the queue.  By tracking this pointer (the pointer to
	 * the most recently inserted entry) it will be possible later in the
	 * PostSend() and PostRecv() code paths to find the entry that needs
	 * its "doorbelled" flag set (see the comments in
	 * dapli_tavor_post_recv() and/or dapli_tavor_post_send()).
	 */
	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;

	/* Update the tail index */
	tail = ((tail + 1) & (size - 1));
	wq->wq_wrid_post->wl_tail = tail;

	/*
	 * If the "tail" index has just wrapped over into the "head" index,
	 * then we have filled the container.  We use the "full" flag to
	 * indicate this condition and to distinguish it from the "empty"
	 * condition (where head and tail are also equal).
	 */
	if (head == tail) {
		wq->wq_wrid_post->wl_full = 1;
	}
}
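
/*
 * Example: with wl_size == 8, head == 2 and tail == 1, adding an entry
 * advances tail to 2 == head, so wl_full is set; that is what
 * distinguishes the full container from the empty one (head == tail
 * with wl_full clear).
 */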

/*
 * dapli_tavor_wrid_add_entry_srq()
 */
extern void
dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t srq, uint64_t wrid,
    uint32_t wqe_index)
{
	dapls_tavor_wrid_entry_t	*wre;

	/* ASSERT on impossible wqe_index values */
	dapl_os_assert(wqe_index < srq->srq_wq_numwqe);

	/*
	 * Setup the WRE.
	 *
	 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
	 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
	 * this information and associate the WRID to the WQE found on the CQE.
	 * Note: all Recv WQEs are essentially "signaled"
	 */
	wre = &srq->srq_wridlist->wl_wre[wqe_index];
	wre->wr_wrid = wrid;
	wre->wr_signaled_dbd = (uint32_t)TAVOR_WRID_ENTRY_SIGNALED;
}

/*
 * dapli_tavor_cq_srq_entries_flush()
 */
static void
dapli_tavor_cq_srq_entries_flush(ib_qp_handle_t qp)
{
	ib_cq_handle_t		cq;
	dapls_tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t		new_indx, check_indx, indx;
	uint32_t		num_to_increment;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	/* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	dapl_os_assert(wqhdr->wq_wrid_post != NULL);
	dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Loop through the CQ looking for entries owned by software.  If an
	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to know how many entries total we have on our CQ.  We use this
	 * value further down to know how many entries to loop through looking
	 * for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_addr[tail_cons_indx];
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know how many
	 * total CQEs possible there are.  Set the 'check_indx' and the
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_addr[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.  We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_num &&
		    cqe_type == TAVOR_COMPLETION_RECV) {
			/* Add back to SRQ free list */
			(void) dapli_tavor_wrid_find_match_srq(
			    wqhdr->wq_wrid_post, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_addr[new_indx];
				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				(void) dapl_os_memcpy(next_cqe, cqe,
				    sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to the
		 * index specified by 'new_indx'.  This 'new_indx' will be used
		 * as the new consumer index after we mark all freed entries as
		 * having HW ownership.  We do that here.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_addr[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);
		}
	}

	/*
	 * Update consumer index to be the 'new_indx'.  This moves it past all
	 * removed entries.  Because 'new_indx' is pointing to the last
	 * previously valid SW owned entry, we add 1 to point the cons_indx to
	 * the first HW owned entry.
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_size) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;

		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_num, num_to_increment);
	}
}
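
/*
 * Example: with cons_indx == 10, SW-owned CQEs at indices 10..13 and
 * SRQ matches at 11 and 12, the backward scan above leaves entry 13 in
 * place, copies entry 10 up to index 12, returns the two matching
 * entries to the SRQ free list, hands indices 10 and 11 back to
 * hardware ownership, and leaves cq_consindx at 12.
 */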

/* ARGSUSED */
static void
dapli_tavor_qp_init(ib_qp_handle_t qp)
{
}

/* ARGSUSED */
static void
dapli_tavor_cq_init(ib_cq_handle_t cq)
{
}

/* ARGSUSED */
static void
dapli_tavor_srq_init(ib_srq_handle_t srq)
{
}

void
dapls_init_funcs_tavor(DAPL_HCA *hca_ptr)
{
	hca_ptr->post_send = dapli_tavor_post_send;
	hca_ptr->post_recv = dapli_tavor_post_recv;
	hca_ptr->post_srq = dapli_tavor_post_srq;
	hca_ptr->cq_peek = dapli_tavor_cq_peek;
	hca_ptr->cq_poll = dapli_tavor_cq_poll;
	hca_ptr->cq_poll_one = dapli_tavor_cq_poll_one;
	hca_ptr->cq_notify = dapli_tavor_cq_notify;
	hca_ptr->srq_flush = dapli_tavor_cq_srq_entries_flush;
	hca_ptr->qp_init = dapli_tavor_qp_init;
	hca_ptr->cq_init = dapli_tavor_cq_init;
	hca_ptr->srq_init = dapli_tavor_srq_init;
	hca_ptr->hermon_resize_cq = 0;
}