tavor_wr.c revision 9517:b4839b0aa7a4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * tavor_wr.c
29 *    Tavor Work Request Processing Routines
30 *
31 *    Implements all the routines necessary to provide the PostSend(),
32 *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33 *    necessary to implement the Tavor WRID tracking mechanism.
34 */
35
36#include <sys/types.h>
37#include <sys/conf.h>
38#include <sys/ddi.h>
39#include <sys/sunddi.h>
40#include <sys/modctl.h>
41#include <sys/avl.h>
42
43#include <sys/ib/adapters/tavor/tavor.h>
44
45static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47#pragma inline(tavor_qp_send_doorbell)
48static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
49    uint32_t nds, uint32_t qpn, uint32_t credits);
50#pragma inline(tavor_qp_recv_doorbell)
51static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
52static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
53static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
54    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
55static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
56    ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
57    uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
58static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
59    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
60static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
61    uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
62    tavor_qphdl_t qp);
63static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
64    ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
65static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
66    uint64_t *prev, tavor_qphdl_t qp);
67static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
68    ibt_recv_wr_t *wr, uint64_t *desc);
69static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
70    tavor_srqhdl_t srq);
71static void tavor_wqe_sync(void *hdl, uint_t sync_from,
72    uint_t sync_to, uint_t sync_type, uint_t flag);
73static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
74    tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
75static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
76static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
77    uint_t send_or_recv);
78static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
79    tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
80static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
81static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
82    tavor_wrid_list_hdr_t *wrid_list);
83static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
84    tavor_wrid_list_hdr_t *wrid_list);
85static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
86static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
87static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
88static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
89static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
90
91/*
92 * tavor_post_send()
93 *    Context: Can be called from interrupt or base context.
94 */
95int
96tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
97    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
98{
99	tavor_sw_wqe_dbinfo_t		dbinfo;
100	tavor_wrid_list_hdr_t		*wridlist;
101	tavor_wrid_entry_t		*wre_last;
102	uint64_t			*desc, *prev, *first;
103	uint32_t			desc_sz, first_sz;
104	uint32_t			wqeaddrsz, signaled_dbd;
105	uint32_t			head, tail, next_tail, qsize_msk;
106	uint32_t			sync_from, sync_to;
107	uint_t				currindx, wrindx, numremain;
108	uint_t				chainlen, chainbegin, posted_cnt;
109	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
110	int				status;
111
112	TAVOR_TNF_ENTER(tavor_post_send);
113
114	/*
115	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
116	 * clients to post to QP memory that is accessible directly by the
117	 * user.  If the QP memory is user accessible, then return an error.
118	 */
119	if (qp->qp_is_umap) {
120		TNF_PROBE_0(tavor_post_send_inv_usrmapped_type,
121		    TAVOR_TNF_ERROR, "");
122		TAVOR_TNF_EXIT(tavor_post_send);
123		return (IBT_QP_HDL_INVALID);
124	}
125
126	/* Initialize posted_cnt */
127	posted_cnt = 0;
128
129	mutex_enter(&qp->qp_lock);
130
131	/*
132	 * Check QP state.  Can not post Send requests from the "Reset",
133	 * "Init", or "RTR" states
134	 */
135	if ((qp->qp_state == TAVOR_QP_RESET) ||
136	    (qp->qp_state == TAVOR_QP_INIT) ||
137	    (qp->qp_state == TAVOR_QP_RTR)) {
138		mutex_exit(&qp->qp_lock);
139		TNF_PROBE_0(tavor_post_send_inv_qpstate_fail,
140		    TAVOR_TNF_ERROR, "");
141		TAVOR_TNF_EXIT(tavor_post_send);
142		return (IBT_QP_STATE_INVALID);
143	}
144
145	/* Grab the lock for the WRID list */
146	mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
147	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;
148
149	/* Save away some initial QP state */
150	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
151	tail	  = qp->qp_sq_wqhdr->wq_tail;
152	head	  = qp->qp_sq_wqhdr->wq_head;
153
154	/*
155	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
156	 * request and build a Send WQE.  Note:  Because we are potentially
157	 * building a chain of WQEs, we want to link them all together.
158	 * However, we do not want to link the first one to the previous
159	 * WQE until the entire chain has been linked.  Then in the last
160	 * step we ring the appropriate doorbell.  Note:  It is possible for
161	 * more Work Requests to be posted than the HW will support at one
162	 * shot.  If this happens, we need to be able to post and ring
163	 * several chains here until the the entire request is complete.
164	 */
165	wrindx = 0;
166	numremain = num_wr;
167	status	  = DDI_SUCCESS;
168	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
169		/*
170		 * For the first WQE on a new chain we need "prev" to point
171		 * to the current descriptor.  As we begin to process
172		 * further, "prev" will be updated to point to the previous
173		 * WQE on the current chain (see below).
174		 */
175		prev = TAVOR_QP_SQ_ENTRY(qp, tail);
176
177		/*
178		 * Before we begin, save the current "tail index" for later
179		 * DMA sync
180		 */
181		sync_from = tail;
182
183		/*
184		 * Break the request up into chains that are less than or
185		 * equal to the maximum number of WQEs that can be posted
186		 * per doorbell ring
187		 */
188		chainlen   = (numremain > maxdb) ? maxdb : numremain;
189		numremain -= chainlen;
190		chainbegin = wrindx;
191		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
192			/*
193			 * Check for "queue full" condition.  If the queue
194			 * is already full, then no more WQEs can be posted.
195			 * So break out, ring a doorbell (if necessary) and
196			 * return an error
197			 */
198			if (qp->qp_sq_wqhdr->wq_full != 0) {
199				status = IBT_QP_FULL;
200				TNF_PROBE_0_DEBUG(tavor_post_send_sqfull,
201				    TAVOR_TNF_TRACE, "");
202				break;
203			}
204
205			/*
206			 * Increment the "tail index" and check for "queue
207			 * full" condition.  If we detect that the current
208			 * work request is going to fill the work queue, then
209			 * we mark this condition and continue.
210			 */
211			next_tail = (tail + 1) & qsize_msk;
212			if (next_tail == head) {
213				qp->qp_sq_wqhdr->wq_full = 1;
214			}
215
216			/*
217			 * Get the address of the location where the next
218			 * Send WQE should be built
219			 */
220			desc = TAVOR_QP_SQ_ENTRY(qp, tail);
221
222			/*
223			 * Call tavor_wqe_send_build() to build the WQE
224			 * at the given address.  This routine uses the
225			 * information in the ibt_send_wr_t list (wr[]) and
226			 * returns the size of the WQE when it returns.
227			 */
228			status = tavor_wqe_send_build(state, qp,
229			    &wr[wrindx], desc, &desc_sz);
230			if (status != DDI_SUCCESS) {
231				TNF_PROBE_0(tavor_post_send_bldwqe_fail,
232				    TAVOR_TNF_ERROR, "");
233				break;
234			}
235
236			/*
237			 * Add a WRID entry to the WRID list.  Need to
238			 * calculate the "wqeaddrsz" and "signaled_dbd"
239			 * values to pass to tavor_wrid_add_entry()
240			 */
241			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
242			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
243			    desc_sz);
244			if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
245			    (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
246				signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
247			} else {
248				signaled_dbd = 0;
249			}
250			tavor_wrid_add_entry(qp->qp_sq_wqhdr,
251			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
252
253			/*
254			 * If this is not the first descriptor on the current
255			 * chain, then link it to the previous WQE.  Otherwise,
256			 * save the address and size of this descriptor (in
257			 * "first" and "first_sz" respectively) and continue.
258			 * Note: Linking a WQE to the the previous one will
259			 * depend on whether the two WQEs are from "special
260			 * QPs" (i.e. MLX transport WQEs) or whether they are
261			 * normal Send WQEs.
262			 */
263			if (currindx != 0) {
264				if (qp->qp_is_special) {
265					tavor_wqe_mlx_linknext(&wr[wrindx - 1],
266					    desc, desc_sz, prev, NULL, qp);
267				} else {
268					tavor_wqe_send_linknext(&wr[wrindx],
269					    &wr[wrindx - 1], desc, desc_sz,
270					    prev, NULL, qp);
271				}
272				prev = desc;
273			} else {
274				first	 = desc;
275				first_sz = desc_sz;
276			}
277
278			/*
279			 * Update the current "tail index" and increment
280			 * "posted_cnt"
281			 */
282			tail = next_tail;
283			posted_cnt++;
284		}
285
286		/*
287		 * If we reach here and there are one or more WQEs which have
288		 * been successfully chained together, then we need to link
289		 * the current chain to the previously executing chain of
290		 * descriptor (if there is one) and ring the doorbell for the
291		 * send work queue.
292		 */
293		if (currindx != 0) {
294			/*
295			 * Before we link the chain, we need to ensure that the
296			 * "next" field on the last WQE is set to NULL (to
297			 * indicate the end of the chain).  Note: Just as it
298			 * did above, the format for the "next" fields in a
299			 * given WQE depend on whether the WQE is MLX
300			 * transport or not.
301			 */
302			if (qp->qp_is_special) {
303				tavor_wqe_mlx_linknext(&wr[chainbegin +
304				    currindx - 1], NULL, 0, prev, NULL, qp);
305			} else {
306				tavor_wqe_send_linknext(NULL,
307				    &wr[chainbegin + currindx - 1], NULL, 0,
308				    prev, NULL, qp);
309			}
310
311			/* Save away updated "tail index" for the DMA sync */
312			sync_to = tail;
313
314			/* Do a DMA sync for current send WQE(s) */
315			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
316			    DDI_DMA_SYNC_FORDEV);
317
318			/*
319			 * Now link the chain to the old chain (if there was
320			 * one.  Note: still need to pay attention to whether
321			 * the QP used MLX transport WQEs or not.
322			 */
323			if (qp->qp_is_special) {
324				tavor_wqe_mlx_linknext(NULL, first, first_sz,
325				    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
326			} else {
327				tavor_wqe_send_linknext(&wr[chainbegin], NULL,
328				    first, first_sz, qp->qp_sq_lastwqeaddr,
329				    &dbinfo, qp);
330			}
331
332			/*
333			 * If there was a valid previous WQE (i.e. non-NULL),
334			 * then sync it too.  This is because we have updated
335			 * its "next" fields and we want to ensure that the
336			 * hardware can see the changes.
337			 */
338			if (qp->qp_sq_lastwqeaddr != NULL) {
339				sync_to   = sync_from;
340				sync_from = (sync_from - 1) & qsize_msk;
341				tavor_wqe_sync(qp, sync_from, sync_to,
342				    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
343			}
344
345			/*
346			 * Now if the WRID tail entry is non-NULL, then this
347			 * represents the entry to which we are chaining the
348			 * new entries.  Since we are going to ring the
349			 * doorbell for this WQE, we want set its "dbd" bit.
350			 *
351			 * On the other hand, if the tail is NULL, even though
352			 * we will have rung the doorbell for the previous WQE
353			 * (for the hardware's sake) it is irrelevant to our
354			 * purposes (for tracking WRIDs) because we know the
355			 * request must have already completed.
356			 */
357			wre_last = wridlist->wl_wre_old_tail;
358			if (wre_last != NULL) {
359				wre_last->wr_signaled_dbd |=
360				    TAVOR_WRID_ENTRY_DOORBELLED;
361			}
362
363			/* Update some of the state in the QP */
364			qp->qp_sq_lastwqeaddr	 = desc;
365			qp->qp_sq_wqhdr->wq_tail = tail;
366
367			/* Ring the doorbell */
368			tavor_qp_send_doorbell(state,
369			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
370			    first_sz, qp->qp_qpnum, dbinfo.db_fence,
371			    dbinfo.db_nopcode);
372		}
373	}
374
375	/*
376	 * Update the "num_posted" return value (if necessary).  Then drop
377	 * the locks and return success.
378	 */
379	if (num_posted != NULL) {
380		*num_posted = posted_cnt;
381	}
382
383	mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
384	mutex_exit(&qp->qp_lock);
385
386	TAVOR_TNF_EXIT(tavor_post_send);
387	return (status);
388}
389
390
391/*
392 * tavor_post_recv()
393 *    Context: Can be called from interrupt or base context.
394 */
395int
396tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
397    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
398{
399	tavor_wrid_list_hdr_t		*wridlist;
400	tavor_wrid_entry_t		*wre_last;
401	uint64_t			*desc, *prev, *first;
402	uint32_t			desc_sz, first_sz;
403	uint32_t			wqeaddrsz, signaled_dbd;
404	uint32_t			head, tail, next_tail, qsize_msk;
405	uint32_t			sync_from, sync_to;
406	uint_t				currindx, wrindx, numremain;
407	uint_t				chainlen, posted_cnt;
408	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
409	int				status;
410
411	TAVOR_TNF_ENTER(tavor_post_recv);
412
413	/*
414	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
415	 * clients to post to QP memory that is accessible directly by the
416	 * user.  If the QP memory is user accessible, then return an error.
417	 */
418	if (qp->qp_is_umap) {
419		TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type,
420		    TAVOR_TNF_ERROR, "");
421		TAVOR_TNF_EXIT(tavor_post_recv);
422		return (IBT_QP_HDL_INVALID);
423	}
424
425	/* Initialize posted_cnt */
426	posted_cnt = 0;
427
428	mutex_enter(&qp->qp_lock);
429
430	/*
431	 * Check if QP is associated with an SRQ
432	 */
433	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
434		mutex_exit(&qp->qp_lock);
435		TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq,
436		    TAVOR_TNF_ERROR, "");
437		TAVOR_TNF_EXIT(tavor_post_recv);
438		return (IBT_SRQ_IN_USE);
439	}
440
441	/*
442	 * Check QP state.  Can not post Recv requests from the "Reset" state
443	 */
444	if (qp->qp_state == TAVOR_QP_RESET) {
445		mutex_exit(&qp->qp_lock);
446		TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail,
447		    TAVOR_TNF_ERROR, "");
448		TAVOR_TNF_EXIT(tavor_post_recv);
449		return (IBT_QP_STATE_INVALID);
450	}
451
452	/* Grab the lock for the WRID list */
453	mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
454	wridlist  = qp->qp_rq_wqhdr->wq_wrid_post;
455
456	/* Save away some initial QP state */
457	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
458	tail	  = qp->qp_rq_wqhdr->wq_tail;
459	head	  = qp->qp_rq_wqhdr->wq_head;
460
461	/*
462	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
463	 * request and build a Recv WQE.  Note:  Because we are potentially
464	 * building a chain of WQEs, we want to link them all together.
465	 * However, we do not want to link the first one to the previous
466	 * WQE until the entire chain has been linked.  Then in the last
467	 * step we ring the appropriate doorbell.  Note:  It is possible for
468	 * more Work Requests to be posted than the HW will support at one
469	 * shot.  If this happens, we need to be able to post and ring
470	 * several chains here until the the entire request is complete.
471	 */
472	wrindx = 0;
473	numremain = num_wr;
474	status	  = DDI_SUCCESS;
475	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
476		/*
477		 * For the first WQE on a new chain we need "prev" to point
478		 * to the current descriptor.  As we begin to process
479		 * further, "prev" will be updated to point to the previous
480		 * WQE on the current chain (see below).
481		 */
482		prev = TAVOR_QP_RQ_ENTRY(qp, tail);
483
484		/*
485		 * Before we begin, save the current "tail index" for later
486		 * DMA sync
487		 */
488		sync_from = tail;
489
490		/*
491		 * Break the request up into chains that are less than or
492		 * equal to the maximum number of WQEs that can be posted
493		 * per doorbell ring
494		 */
495		chainlen = (numremain > maxdb) ? maxdb : numremain;
496		numremain -= chainlen;
497		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
498			/*
499			 * Check for "queue full" condition.  If the queue
500			 * is already full, then no more WQEs can be posted.
501			 * So break out, ring a doorbell (if necessary) and
502			 * return an error
503			 */
504			if (qp->qp_rq_wqhdr->wq_full != 0) {
505				status = IBT_QP_FULL;
506				TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull,
507				    TAVOR_TNF_TRACE, "");
508				break;
509			}
510
511			/*
512			 * Increment the "tail index" and check for "queue
513			 * full" condition.  If we detect that the current
514			 * work request is going to fill the work queue, then
515			 * we mark this condition and continue.
516			 */
517			next_tail = (tail + 1) & qsize_msk;
518			if (next_tail == head) {
519				qp->qp_rq_wqhdr->wq_full = 1;
520			}
521
522			/*
523			 * Get the address of the location where the next
524			 * Recv WQE should be built
525			 */
526			desc = TAVOR_QP_RQ_ENTRY(qp, tail);
527
528			/*
529			 * Call tavor_wqe_recv_build() to build the WQE
530			 * at the given address.  This routine uses the
531			 * information in the ibt_recv_wr_t list (wr[]) and
532			 * returns the size of the WQE when it returns.
533			 */
534			status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
535			    desc, &desc_sz);
536			if (status != DDI_SUCCESS) {
537				TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
538				    TAVOR_TNF_ERROR, "");
539				break;
540			}
541
542			/*
543			 * Add a WRID entry to the WRID list.  Need to
544			 * calculate the "wqeaddrsz" and "signaled_dbd"
545			 * values to pass to tavor_wrid_add_entry().  Note:
546			 * all Recv WQEs are essentially "signaled"
547			 */
548			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
549			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
550			    desc_sz);
551			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
552			tavor_wrid_add_entry(qp->qp_rq_wqhdr,
553			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
554
555			/*
556			 * If this is not the first descriptor on the current
557			 * chain, then link it to the previous WQE.  Otherwise,
558			 * save the address and size of this descriptor (in
559			 * "first" and "first_sz" respectively) and continue.
560			 */
561			if (currindx != 0) {
562				tavor_wqe_recv_linknext(desc, desc_sz, prev,
563				    qp);
564				prev = desc;
565			} else {
566				first	 = desc;
567				first_sz = desc_sz;
568			}
569
570			/*
571			 * Update the current "tail index" and increment
572			 * "posted_cnt"
573			 */
574			tail = next_tail;
575			posted_cnt++;
576		}
577
578		/*
579		 * If we reach here and there are one or more WQEs which have
580		 * been successfully chained together, then we need to link
581		 * the current chain to the previously executing chain of
582		 * descriptor (if there is one) and ring the doorbell for the
583		 * recv work queue.
584		 */
585		if (currindx != 0) {
586			/*
587			 * Before we link the chain, we need to ensure that the
588			 * "next" field on the last WQE is set to NULL (to
589			 * indicate the end of the chain).
590			 */
591			tavor_wqe_recv_linknext(NULL, 0, prev, qp);
592
593			/* Save away updated "tail index" for the DMA sync */
594			sync_to = tail;
595
596			/* Do a DMA sync for current recv WQE(s) */
597			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
598			    DDI_DMA_SYNC_FORDEV);
599
600			/*
601			 * Now link the chain to the old chain (if there was
602			 * one.
603			 */
604			tavor_wqe_recv_linknext(first, first_sz,
605			    qp->qp_rq_lastwqeaddr, qp);
606
607			/*
608			 * If there was a valid previous WQE (i.e. non-NULL),
609			 * then sync it too.  This is because we have updated
610			 * its "next" fields and we want to ensure that the
611			 * hardware can see the changes.
612			 */
613			if (qp->qp_rq_lastwqeaddr != NULL) {
614				sync_to	  = sync_from;
615				sync_from = (sync_from - 1) & qsize_msk;
616				tavor_wqe_sync(qp, sync_from, sync_to,
617				    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
618			}
619
620			/*
621			 * Now if the WRID tail entry is non-NULL, then this
622			 * represents the entry to which we are chaining the
623			 * new entries.  Since we are going to ring the
624			 * doorbell for this WQE, we want set its "dbd" bit.
625			 *
626			 * On the other hand, if the tail is NULL, even though
627			 * we will have rung the doorbell for the previous WQE
628			 * (for the hardware's sake) it is irrelevant to our
629			 * purposes (for tracking WRIDs) because we know the
630			 * request must have already completed.
631			 */
632			wre_last = wridlist->wl_wre_old_tail;
633			if (wre_last != NULL) {
634				wre_last->wr_signaled_dbd |=
635				    TAVOR_WRID_ENTRY_DOORBELLED;
636			}
637
638			/* Update some of the state in the QP */
639			qp->qp_rq_lastwqeaddr	 = desc;
640			qp->qp_rq_wqhdr->wq_tail = tail;
641
642			/* Ring the doorbell */
643			tavor_qp_recv_doorbell(state,
644			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
645			    first_sz, qp->qp_qpnum, (chainlen % maxdb));
646		}
647	}
648
649	/*
650	 * Update the "num_posted" return value (if necessary).  Then drop
651	 * the locks and return success.
652	 */
653	if (num_posted != NULL) {
654		*num_posted = posted_cnt;
655	}
656
657	mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
658	mutex_exit(&qp->qp_lock);
659
660	TAVOR_TNF_EXIT(tavor_post_recv);
661	return (status);
662}
663
664/*
665 * tavor_post_srq()
666 *    Context: Can be called from interrupt or base context.
667 */
668int
669tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
670    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
671{
672	uint64_t			*desc, *prev, *first, *last_wqe_addr;
673	uint32_t			signaled_dbd;
674	uint32_t			sync_indx;
675	uint_t				currindx, wrindx, numremain;
676	uint_t				chainlen, posted_cnt;
677	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
678	int				status;
679
680	TAVOR_TNF_ENTER(tavor_post_srq);
681
682	/*
683	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
684	 * clients to post to QP memory that is accessible directly by the
685	 * user.  If the QP memory is user accessible, then return an error.
686	 */
687	if (srq->srq_is_umap) {
688		TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type,
689		    TAVOR_TNF_ERROR, "");
690		TAVOR_TNF_EXIT(tavor_post_srq);
691		return (IBT_SRQ_HDL_INVALID);
692	}
693
694	/* Initialize posted_cnt */
695	posted_cnt = 0;
696
697	mutex_enter(&srq->srq_lock);
698
699	/*
700	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
701	 */
702	if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
703		mutex_exit(&srq->srq_lock);
704		TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail,
705		    TAVOR_TNF_ERROR, "");
706		TAVOR_TNF_EXIT(tavor_post_srq);
707		return (IBT_QP_STATE_INVALID);
708	}
709
710	/* Grab the lock for the WRID list */
711	mutex_enter(&srq->srq_wrid_wql->wql_lock);
712
713	/*
714	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
715	 * request and build a Recv WQE.  Note:  Because we are potentially
716	 * building a chain of WQEs, we want to link them all together.
717	 * However, we do not want to link the first one to the previous
718	 * WQE until the entire chain has been linked.  Then in the last
719	 * step we ring the appropriate doorbell.  Note:  It is possible for
720	 * more Work Requests to be posted than the HW will support at one
721	 * shot.  If this happens, we need to be able to post and ring
722	 * several chains here until the the entire request is complete.
723	 */
724	wrindx = 0;
725	numremain = num_wr;
726	status	  = DDI_SUCCESS;
727	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
728		/*
729		 * For the first WQE on a new chain we need "prev" to point
730		 * to the current descriptor.  As we begin to process
731		 * further, "prev" will be updated to point to the previous
732		 * WQE on the current chain (see below).
733		 */
734		if (srq->srq_wq_lastwqeindx == -1) {
735			prev = NULL;
736		} else {
737			prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
738		}
739
740		/*
741		 * Break the request up into chains that are less than or
742		 * equal to the maximum number of WQEs that can be posted
743		 * per doorbell ring
744		 */
745		chainlen = (numremain > maxdb) ? maxdb : numremain;
746		numremain -= chainlen;
747		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
748
749			/*
750			 * Check for "queue full" condition.  If the queue
751			 * is already full, then no more WQEs can be posted.
752			 * So break out, ring a doorbell (if necessary) and
753			 * return an error
754			 */
755			if (srq->srq_wridlist->wl_free_list_indx == -1) {
756				status = IBT_QP_FULL;
757				TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull,
758				    TAVOR_TNF_TRACE, "");
759				break;
760			}
761
762			/*
763			 * Get the address of the location where the next
764			 * Recv WQE should be built
765			 */
766			desc = TAVOR_SRQ_WQE_ADDR(srq,
767			    srq->srq_wridlist->wl_free_list_indx);
768
769			/*
770			 * Add a WRID entry to the WRID list.  Need to
771			 * set the "signaled_dbd" values to pass to
772			 * tavor_wrid_add_entry().  Note: all Recv WQEs are
773			 * essentially "signaled"
774			 *
775			 * The 'size' is stored at srq_alloc time, in the
776			 * srq_wq_stride.  This is a constant value required
777			 * for SRQ.
778			 */
779			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
780			tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
781			    signaled_dbd);
782
783			/*
784			 * Call tavor_wqe_srq_build() to build the WQE
785			 * at the given address.  This routine uses the
786			 * information in the ibt_recv_wr_t list (wr[]) and
787			 * returns the size of the WQE when it returns.
788			 */
789			status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
790			    desc);
791			if (status != DDI_SUCCESS) {
792				TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
793				    TAVOR_TNF_ERROR, "");
794				break;
795			}
796
797			/*
798			 * If this is not the first descriptor on the current
799			 * chain, then link it to the previous WQE.  Otherwise,
800			 * save the address of this descriptor (in "first") and
801			 * continue.
802			 */
803			if (currindx != 0) {
804				tavor_wqe_srq_linknext(desc, prev, srq);
805				sync_indx = TAVOR_SRQ_WQE_INDEX(
806				    srq->srq_wq_buf, prev,
807				    srq->srq_wq_log_wqesz);
808
809				/* Do a DMA sync for previous recv WQE */
810				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
811				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
812
813				prev = desc;
814			} else {
815
816				/*
817				 * In this case, the last WQE on the chain is
818				 * also considered 'first'.  So set prev to
819				 * first, here.
820				 */
821				first = prev = desc;
822			}
823
824			/*
825			 * Increment "posted_cnt"
826			 */
827			posted_cnt++;
828		}
829
830		/*
831		 * If we reach here and there are one or more WQEs which have
832		 * been successfully chained together, then we need to link
833		 * the current chain to the previously executing chain of
834		 * descriptor (if there is one) and ring the doorbell for the
835		 * recv work queue.
836		 */
837		if (currindx != 0) {
838			/*
839			 * Before we link the chain, we need to ensure that the
840			 * "next" field on the last WQE is set to NULL (to
841			 * indicate the end of the chain).
842			 */
843			tavor_wqe_srq_linknext(NULL, prev, srq);
844
845			sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
846			    srq->srq_wq_log_wqesz);
847
848			/* Do a DMA sync for current recv WQE */
849			tavor_wqe_sync(srq, sync_indx, sync_indx+1,
850			    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
851
852			/*
853			 * Now link the chain to the old chain (if there was
854			 * one).
855			 */
856			if (srq->srq_wq_lastwqeindx == -1) {
857				last_wqe_addr = NULL;
858			} else {
859				last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
860				    srq->srq_wq_lastwqeindx);
861			}
862			tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
863
864			/*
865			 * If there was a valid previous WQE (i.e. valid index),
866			 * then sync it too.  This is because we have updated
867			 * its "next" fields and we want to ensure that the
868			 * hardware can see the changes.
869			 */
870			if (srq->srq_wq_lastwqeindx != -1) {
871				sync_indx = srq->srq_wq_lastwqeindx;
872				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
873				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
874			}
875
876			/* Update some of the state in the QP */
877			srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
878			    srq->srq_wq_buf, desc,
879			    srq->srq_wq_log_wqesz);
880
881			/* Ring the doorbell */
882			/* SRQ needs NDS of 0 */
883			tavor_qp_recv_doorbell(state,
884			    (uint32_t)((uintptr_t)first - srq->srq_desc_off),
885			    0, srq->srq_srqnum, (chainlen % maxdb));
886		}
887	}
888
889	/*
890	 * Update the "num_posted" return value (if necessary).  Then drop
891	 * the locks and return success.
892	 */
893	if (num_posted != NULL) {
894		*num_posted = posted_cnt;
895	}
896
897	mutex_exit(&srq->srq_wrid_wql->wql_lock);
898	mutex_exit(&srq->srq_lock);
899
900	TAVOR_TNF_EXIT(tavor_post_srq);
901	return (status);
902}
903
904
905/*
906 * tavor_qp_send_doorbell()
907 *    Context: Can be called from interrupt or base context.
908 */
909static void
910tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
911    uint32_t qpn, uint32_t fence, uint32_t nopcode)
912{
913	uint64_t	doorbell = 0;
914
915	/* Build the doorbell from the parameters */
916	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
917	    TAVOR_QPSNDDB_NDA_SHIFT) |
918	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
919	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
920	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
921
922	TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "",
923	    tnf_ulong, doorbell, doorbell);
924
925	/* Write the doorbell to UAR */
926	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
927	    doorbell);
928}
929
930
931/*
932 * tavor_qp_recv_doorbell()
933 *    Context: Can be called from interrupt or base context.
934 */
935static void
936tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
937    uint32_t qpn, uint32_t credits)
938{
939	uint64_t	doorbell = 0;
940
941	/* Build the doorbell from the parameters */
942	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
943	    TAVOR_QPRCVDB_NDA_SHIFT) |
944	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
945	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
946
947	TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "",
948	    tnf_ulong, doorbell, doorbell);
949
950	/* Write the doorbell to UAR */
951	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
952	    doorbell);
953}
954
955
956/*
957 * tavor_wqe_send_build()
958 *    Context: Can be called from interrupt or base context.
959 */
960static int
961tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
962    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
963{
964	tavor_hw_snd_wqe_ud_t		*ud;
965	tavor_hw_snd_wqe_remaddr_t	*rc;
966	tavor_hw_snd_wqe_atomic_t	*at;
967	tavor_hw_snd_wqe_remaddr_t	*uc;
968	tavor_hw_snd_wqe_bind_t		*bn;
969	tavor_hw_wqe_sgl_t		*ds;
970	ibt_wr_ds_t			*sgl;
971	tavor_ahhdl_t			ah;
972	uint32_t			nds;
973	int				i, num_ds, status;
974
975	TAVOR_TNF_ENTER(tavor_wqe_send_build);
976
977	ASSERT(MUTEX_HELD(&qp->qp_lock));
978
979	/* Initialize the information for the Data Segments */
980	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
981	    sizeof (tavor_hw_snd_wqe_nextctrl_t));
982	nds = wr->wr_nds;
983	sgl = wr->wr_sgl;
984	num_ds = 0;
985
986	/*
987	 * Build a Send WQE depends first and foremost on the transport
988	 * type of Work Request (i.e. UD, RC, or UC)
989	 */
990	switch (wr->wr_trans) {
991	case IBT_UD_SRV:
992		/* Ensure that work request transport type matches QP type */
993		if (qp->qp_serv_type != TAVOR_QP_UD) {
994			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
995			    TAVOR_TNF_ERROR, "");
996			TAVOR_TNF_EXIT(tavor_wqe_send_build);
997			return (IBT_QP_SRV_TYPE_INVALID);
998		}
999
1000		/*
1001		 * Validate the operation type.  For UD requests, only the
1002		 * "Send" operation is valid
1003		 */
1004		if (wr->wr_opcode != IBT_WRC_SEND) {
1005			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1006			    TAVOR_TNF_ERROR, "");
1007			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1008			return (IBT_QP_OP_TYPE_INVALID);
1009		}
1010
1011		/*
1012		 * If this is a Special QP (QP0 or QP1), then we need to
1013		 * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
1014		 * and return whatever status it returns
1015		 */
1016		if (qp->qp_is_special) {
1017			status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
1018			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1019			return (status);
1020		}
1021
1022		/*
1023		 * Otherwise, if this is a normal UD Send request, then fill
1024		 * all the fields in the Tavor UD header for the WQE.  Note:
1025		 * to do this we'll need to extract some information from the
1026		 * Address Handle passed with the work request.
1027		 */
1028		ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1029		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1030		ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1031		if (ah == NULL) {
1032			TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail,
1033			    TAVOR_TNF_ERROR, "");
1034			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1035			return (IBT_AH_HDL_INVALID);
1036		}
1037
1038		/*
1039		 * Build the Unreliable Datagram Segment for the WQE, using
1040		 * the information from the address handle and the work
1041		 * request.
1042		 */
1043		mutex_enter(&ah->ah_lock);
1044		TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
1045		mutex_exit(&ah->ah_lock);
1046
1047		/* Update "ds" for filling in Data Segments (below) */
1048		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
1049		    sizeof (tavor_hw_snd_wqe_ud_t));
1050		break;
1051
1052	case IBT_RC_SRV:
1053		/* Ensure that work request transport type matches QP type */
1054		if (qp->qp_serv_type != TAVOR_QP_RC) {
1055			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1056			    TAVOR_TNF_ERROR, "");
1057			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1058			return (IBT_QP_SRV_TYPE_INVALID);
1059		}
1060
1061		/*
1062		 * Validate the operation type.  For RC requests, we allow
1063		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1064		 * operations, and memory window "Bind"
1065		 */
1066		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1067		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1068		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1069		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1070		    (wr->wr_opcode != IBT_WRC_FADD) &&
1071		    (wr->wr_opcode != IBT_WRC_BIND)) {
1072			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1073			    TAVOR_TNF_ERROR, "");
1074			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1075			return (IBT_QP_OP_TYPE_INVALID);
1076		}
1077
1078		/*
1079		 * If this is a Send request, then all we need to do is break
1080		 * out and here and begin the Data Segment processing below
1081		 */
1082		if (wr->wr_opcode == IBT_WRC_SEND) {
1083			break;
1084		}
1085
1086		/*
1087		 * If this is an RDMA Read or RDMA Write request, then fill
1088		 * in the "Remote Address" header fields.
1089		 */
1090		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1091		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1092			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1093			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1094
1095			/*
1096			 * Build the Remote Address Segment for the WQE, using
1097			 * the information from the RC work request.
1098			 */
1099			TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1100
1101			/* Update "ds" for filling in Data Segments (below) */
1102			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1103			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1104			break;
1105		}
1106
1107		/*
1108		 * If this is one of the Atomic type operations (i.e
1109		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1110		 * Address" header fields and the "Atomic" header fields.
1111		 */
1112		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1113		    (wr->wr_opcode == IBT_WRC_FADD)) {
1114			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1115			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1116			at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1117			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1118
1119			/*
1120			 * Build the Remote Address and Atomic Segments for
1121			 * the WQE, using the information from the RC Atomic
1122			 * work request.
1123			 */
1124			TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1125			TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1126
1127			/* Update "ds" for filling in Data Segments (below) */
1128			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1129			    sizeof (tavor_hw_snd_wqe_atomic_t));
1130
1131			/*
1132			 * Update "nds" and "sgl" because Atomic requests have
1133			 * only a single Data Segment (and they are encoded
1134			 * somewhat differently in the work request.
1135			 */
1136			nds = 1;
1137			sgl = wr->wr_sgl;
1138			break;
1139		}
1140
1141		/*
1142		 * If this is memory window Bind operation, then we call the
1143		 * tavor_wr_bind_check() routine to validate the request and
1144		 * to generate the updated RKey.  If this is successful, then
1145		 * we fill in the WQE's "Bind" header fields.
1146		 */
1147		if (wr->wr_opcode == IBT_WRC_BIND) {
1148			status = tavor_wr_bind_check(state, wr);
1149			if (status != DDI_SUCCESS) {
1150				TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1151				    TAVOR_TNF_ERROR, "");
1152				TAVOR_TNF_EXIT(tavor_wqe_send_build);
1153				return (status);
1154			}
1155
1156			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1157			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1158
1159			/*
1160			 * Build the Bind Memory Window Segments for the WQE,
1161			 * using the information from the RC Bind memory
1162			 * window work request.
1163			 */
1164			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1165
1166			/*
1167			 * Update the "ds" pointer.  Even though the "bind"
1168			 * operation requires no SGLs, this is necessary to
1169			 * facilitate the correct descriptor size calculations
1170			 * (below).
1171			 */
1172			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1173			    sizeof (tavor_hw_snd_wqe_bind_t));
1174			nds = 0;
1175		}
1176		break;
1177
1178	case IBT_UC_SRV:
1179		/* Ensure that work request transport type matches QP type */
1180		if (qp->qp_serv_type != TAVOR_QP_UC) {
1181			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1182			    TAVOR_TNF_ERROR, "");
1183			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1184			return (IBT_QP_SRV_TYPE_INVALID);
1185		}
1186
1187		/*
1188		 * Validate the operation type.  For UC requests, we only
1189		 * allow "Send", "RDMA Write", and memory window "Bind".
1190		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1191		 * operations
1192		 */
1193		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1194		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1195		    (wr->wr_opcode != IBT_WRC_BIND)) {
1196			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1197			    TAVOR_TNF_ERROR, "");
1198			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1199			return (IBT_QP_OP_TYPE_INVALID);
1200		}
1201
1202		/*
1203		 * If this is a Send request, then all we need to do is break
1204		 * out and here and begin the Data Segment processing below
1205		 */
1206		if (wr->wr_opcode == IBT_WRC_SEND) {
1207			break;
1208		}
1209
1210		/*
1211		 * If this is an RDMA Write request, then fill in the "Remote
1212		 * Address" header fields.
1213		 */
1214		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1215			uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1216			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1217
1218			/*
1219			 * Build the Remote Address Segment for the WQE, using
1220			 * the information from the UC work request.
1221			 */
1222			TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1223
1224			/* Update "ds" for filling in Data Segments (below) */
1225			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1226			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1227			break;
1228		}
1229
1230		/*
1231		 * If this is memory window Bind operation, then we call the
1232		 * tavor_wr_bind_check() routine to validate the request and
1233		 * to generate the updated RKey.  If this is successful, then
1234		 * we fill in the WQE's "Bind" header fields.
1235		 */
1236		if (wr->wr_opcode == IBT_WRC_BIND) {
1237			status = tavor_wr_bind_check(state, wr);
1238			if (status != DDI_SUCCESS) {
1239				TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1240				    TAVOR_TNF_ERROR, "");
1241				TAVOR_TNF_EXIT(tavor_wqe_send_build);
1242				return (status);
1243			}
1244
1245			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1246			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1247
1248			/*
1249			 * Build the Bind Memory Window Segments for the WQE,
1250			 * using the information from the UC Bind memory
1251			 * window work request.
1252			 */
1253			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1254
1255			/*
1256			 * Update the "ds" pointer.  Even though the "bind"
1257			 * operation requires no SGLs, this is necessary to
1258			 * facilitate the correct descriptor size calculations
1259			 * (below).
1260			 */
1261			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1262			    sizeof (tavor_hw_snd_wqe_bind_t));
1263			nds = 0;
1264		}
1265		break;
1266
1267	default:
1268		TNF_PROBE_0(tavor_wqe_send_build_inv_tranport_fail,
1269		    TAVOR_TNF_ERROR, "");
1270		TAVOR_TNF_EXIT(tavor_wqe_send_build);
1271		return (IBT_QP_SRV_TYPE_INVALID);
1272	}
1273
1274	/*
1275	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1276	 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer
1277	 * Start by checking for a valid number of SGL entries
1278	 */
1279	if (nds > qp->qp_sq_sgl) {
1280		TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail,
1281		    TAVOR_TNF_ERROR, "");
1282		TAVOR_TNF_EXIT(tavor_wqe_send_build);
1283		return (IBT_QP_SGL_LEN_INVALID);
1284	}
1285
1286	/*
1287	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1288	 * segments.  Note: We skip any SGL with zero size because Tavor
1289	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1290	 * the encoding for zero means a 2GB transfer.  Because of this special
1291	 * encoding in the hardware, we mask the requested length with
1292	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1293	 * zero.)
1294	 */
1295	for (i = 0; i < nds; i++) {
1296		if (sgl[i].ds_len == 0) {
1297			continue;
1298		}
1299
1300		/*
1301		 * Fill in the Data Segment(s) for the current WQE, using the
1302		 * information contained in the scatter-gather list of the
1303		 * work request.
1304		 */
1305		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1306		num_ds++;
1307	}
1308
1309	/* Return the size of descriptor (in 16-byte chunks) */
1310	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1311
1312	TAVOR_TNF_EXIT(tavor_wqe_send_build);
1313	return (DDI_SUCCESS);
1314}
1315
1316
1317/*
1318 * tavor_wqe_send_linknext()
1319 *    Context: Can be called from interrupt or base context.
1320 */
1321static void
1322tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1323    uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1324    tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1325{
1326	uint64_t	next, ctrl;
1327	uint32_t	nopcode, fence;
1328
1329	/*
1330	 * Calculate the "next" field of the descriptor.  This amounts to
1331	 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1332	 * fields (see tavor_hw.h for more).  Note:  If there is no next
1333	 * descriptor (i.e. if the current descriptor is the last WQE on
1334	 * the chain), then set "next" to zero.
1335	 */
1336	if (curr_desc != NULL) {
1337		/*
1338		 * Determine the value for the Tavor WQE "nopcode" field
1339		 * by using the IBTF opcode from the work request
1340		 */
1341		switch (curr_wr->wr_opcode) {
1342		case IBT_WRC_RDMAW:
1343			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1344				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1345			} else {
1346				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1347			}
1348			break;
1349
1350		case IBT_WRC_SEND:
1351			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1352				nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1353			} else {
1354				nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1355			}
1356			break;
1357
1358		case IBT_WRC_RDMAR:
1359			nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1360			break;
1361
1362		case IBT_WRC_CSWAP:
1363			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1364			break;
1365
1366		case IBT_WRC_FADD:
1367			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1368			break;
1369
1370		case IBT_WRC_BIND:
1371			nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1372			break;
1373		}
1374
1375		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1376		    - qp->qp_desc_off);
1377		next  = ((uint64_t)(uintptr_t)curr_desc &
1378		    TAVOR_WQE_NDA_MASK) << 32;
1379		next  = next | ((uint64_t)nopcode << 32);
1380		fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1381		if (fence) {
1382			next = next | TAVOR_WQE_SEND_FENCE_MASK;
1383		}
1384		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1385
1386		/*
1387		 * If a send queue doorbell will be rung for the next
1388		 * WQE on the chain, then set the current WQE's "dbd" bit.
1389		 * Note: We also update the "dbinfo" structure here to pass
1390		 * back information about what should (later) be included
1391		 * in the send queue doorbell.
1392		 */
1393		if (dbinfo) {
1394			next = next | TAVOR_WQE_DBD_MASK;
1395			dbinfo->db_nopcode = nopcode;
1396			dbinfo->db_fence   = fence;
1397		}
1398	} else {
1399		next = 0;
1400	}
1401
1402	/*
1403	 * If this WQE is supposed to be linked to the previous descriptor,
1404	 * then we need to update not only the previous WQE's "next" fields
1405	 * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
1406	 * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
1407	 * the "e" bit is always hardcoded to zero.
1408	 */
1409	if (prev_desc != NULL) {
1410		/*
1411		 * If a send queue doorbell will be rung for the next WQE on
1412		 * the chain, then update the current WQE's "next" field and
1413		 * return.
1414		 * Note: We don't want to modify the "ctrl" field here because
1415		 * that portion of the previous WQE has already been set
1416		 * correctly at some previous point in time.
1417		 */
1418		if (dbinfo) {
1419			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1420			return;
1421		}
1422
1423		ctrl = 0;
1424
1425		/* Set the "c" (i.e. "signaled") bit appropriately */
1426		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1427			ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1428		}
1429
1430		/* Set the "s" (i.e. "solicited") bit appropriately */
1431		if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1432			ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1433		}
1434
1435		/* Set the "i" bit and the immediate data appropriately */
1436		if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1437			ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1438			ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1439		}
1440
1441		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1442	}
1443}
1444
1445
1446/*
1447 * tavor_wqe_mlx_build()
1448 *    Context: Can be called from interrupt or base context.
1449 */
1450static int
1451tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1452    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1453{
1454	tavor_hw_udav_t		udav;
1455	tavor_ahhdl_t		ah;
1456	ib_lrh_hdr_t		*lrh;
1457	ib_grh_t		*grh;
1458	ib_bth_hdr_t		*bth;
1459	ib_deth_hdr_t		*deth;
1460	tavor_hw_wqe_sgl_t	*ds;
1461	ibt_wr_ds_t		*sgl;
1462	uint8_t			*mgmtclass, *hpoint, *hcount;
1463	uint64_t		data;
1464	uint32_t		nds, offset, pktlen;
1465	uint32_t		desc_sz, udav_sz;
1466	int			i, num_ds;
1467
1468	TAVOR_TNF_ENTER(tavor_wqe_mlx_build);
1469
1470	ASSERT(MUTEX_HELD(&qp->qp_lock));
1471
1472	/* Initialize the information for the Data Segments */
1473	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1474	    sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1475
1476	/*
1477	 * Pull the address handle from the work request and read in
1478	 * the contents of the UDAV.  This will be used to answer some
1479	 * questions about the request.
1480	 */
1481	ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1482	if (ah == NULL) {
1483		TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail,
1484		    TAVOR_TNF_ERROR, "");
1485		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1486		return (IBT_AH_HDL_INVALID);
1487	}
1488	mutex_enter(&ah->ah_lock);
1489	udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1490	for (i = 0; i < udav_sz; i++) {
1491		data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1492		    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1493		((uint64_t *)&udav)[i] = data;
1494	}
1495	mutex_exit(&ah->ah_lock);
1496
1497	/*
1498	 * If the request is for QP1 and the destination LID is equal to
1499	 * the Permissive LID, then return an error.  This combination is
1500	 * not allowed
1501	 */
1502	if ((udav.rlid == IB_LID_PERMISSIVE) &&
1503	    (qp->qp_is_special == TAVOR_QP_GSI)) {
1504		TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail,
1505		    TAVOR_TNF_ERROR, "");
1506		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1507		return (IBT_AH_HDL_INVALID);
1508	}
1509
1510	/*
1511	 * Calculate the size of the packet headers, including the GRH
1512	 * (if necessary)
1513	 */
1514	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1515	    sizeof (ib_deth_hdr_t);
1516	if (udav.grh) {
1517		desc_sz += sizeof (ib_grh_t);
1518	}
1519
1520	/*
1521	 * Begin to build the first "inline" data segment for the packet
1522	 * headers.  Note:  By specifying "inline" we can build the contents
1523	 * of the MAD packet headers directly into the work queue (as part
1524	 * descriptor).  This has the advantage of both speeding things up
1525	 * and of not requiring the driver to allocate/register any additional
1526	 * memory for the packet headers.
1527	 */
1528	TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1529	desc_sz += 4;
1530
1531	/*
1532	 * Build Local Route Header (LRH)
1533	 *    We start here by building the LRH into a temporary location.
1534	 *    When we have finished we copy the LRH data into the descriptor.
1535	 *
1536	 *    Notice that the VL values are hardcoded.  This is not a problem
1537	 *    because VL15 is decided later based on the value in the MLX
1538	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1539	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1540	 *    values.  This rule does not hold for loopback packets however
1541	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1542	 *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1543	 *
1544	 *    Notice also that Source LID is hardcoded to the Permissive LID
1545	 *    (0xFFFF).  This is also not a problem because if the Destination
1546	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1547	 *    transport "next/ctrl" header will be set to zero and the hardware
1548	 *    will pull the LID from value in the port.
1549	 */
1550	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1551	pktlen = (desc_sz + 0x100) >> 2;
1552	TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1553
1554	/*
1555	 * Build Global Route Header (GRH)
1556	 *    This is only built if necessary as defined by the "grh" bit in
1557	 *    the address vector.  Note:  We also calculate the offset to the
1558	 *    next header (BTH) based on whether or not the "grh" bit is set.
1559	 */
1560	if (udav.grh) {
1561		/*
1562		 * If the request is for QP0, then return an error.  The
1563		 * combination of global routine (GRH) and QP0 is not allowed.
1564		 */
1565		if (qp->qp_is_special == TAVOR_QP_SMI) {
1566			TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail,
1567			    TAVOR_TNF_ERROR, "");
1568			TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1569			return (IBT_AH_HDL_INVALID);
1570		}
1571		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1572		TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1573
1574		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1575	} else {
1576		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1577	}
1578
1579
1580	/*
1581	 * Build Base Transport Header (BTH)
1582	 *    Notice that the M, PadCnt, and TVer fields are all set
1583	 *    to zero implicitly.  This is true for all Management Datagrams
1584	 *    MADs whether GSI are SMI.
1585	 */
1586	TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1587
1588	/*
1589	 * Build Datagram Extended Transport Header (DETH)
1590	 */
1591	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1592	TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1593
1594	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1595	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1596	ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1597	nds = wr->wr_nds;
1598	sgl = wr->wr_sgl;
1599	num_ds = 0;
1600
1601	/*
1602	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1603	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer
1604	 * Start by checking for a valid number of SGL entries
1605	 */
1606	if (nds > qp->qp_sq_sgl) {
1607		TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail,
1608		    TAVOR_TNF_ERROR, "");
1609		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1610		return (IBT_QP_SGL_LEN_INVALID);
1611	}
1612
1613	/*
1614	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1615	 * segments.  Note: We skip any SGL with zero size because Tavor
1616	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1617	 * the encoding for zero means a 2GB transfer.  Because of this special
1618	 * encoding in the hardware, we mask the requested length with
1619	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1620	 * zero.)
1621	 */
1622	mgmtclass = hpoint = hcount = NULL;
1623	offset = 0;
1624	for (i = 0; i < nds; i++) {
1625		if (sgl[i].ds_len == 0) {
1626			continue;
1627		}
1628
1629		/*
1630		 * Fill in the Data Segment(s) for the MLX send WQE, using
1631		 * the information contained in the scatter-gather list of
1632		 * the work request.
1633		 */
1634		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1635
1636		/*
1637		 * Search through the contents of all MADs posted to QP0 to
1638		 * initialize pointers to the places where Directed Route "hop
1639		 * pointer", "hop count", and "mgmtclass" would be.  Tavor
1640		 * needs these updated (i.e. incremented or decremented, as
1641		 * necessary) by software.
1642		 */
1643		if (qp->qp_is_special == TAVOR_QP_SMI) {
1644
1645			TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1646			    offset, sgl[i].ds_va, sgl[i].ds_len);
1647
1648			TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1649			    offset, sgl[i].ds_va, sgl[i].ds_len);
1650
1651			TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1652			    offset, sgl[i].ds_va, sgl[i].ds_len);
1653
1654			offset += sgl[i].ds_len;
1655		}
1656		num_ds++;
1657	}
1658
1659	/*
1660	 * Tavor's Directed Route MADs need to have the "hop pointer"
1661	 * incremented/decremented (as necessary) depending on whether it is
1662	 * currently less than or greater than the "hop count" (i.e. whether
1663	 * the MAD is a request or a response.)
1664	 */
1665	if (qp->qp_is_special == TAVOR_QP_SMI) {
1666		TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1667		    *hpoint, *hcount);
1668	}
1669
1670	/*
1671	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1672	 * just like the packets headers above, but it is only four bytes and
1673	 * set to zero (to indicate that we wish the hardware to generate ICRC.
1674	 */
1675	TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1676	num_ds++;
1677
1678	/* Return the size of descriptor (in 16-byte chunks) */
1679	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1680
1681	TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1682	return (DDI_SUCCESS);
1683}
1684
1685
1686/*
1687 * tavor_wqe_mlx_linknext()
1688 *    Context: Can be called from interrupt or base context.
1689 */
1690static void
1691tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1692    uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1693    tavor_qphdl_t qp)
1694{
1695	tavor_hw_udav_t		udav;
1696	tavor_ahhdl_t		ah;
1697	uint64_t		next, ctrl, data;
1698	uint_t			nopcode;
1699	uint_t			udav_sz;
1700	int			i;
1701
1702	/*
1703	 * Calculate the "next" field of the descriptor.  This amounts to
1704	 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1705	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1706	 * if the current descriptor is the last WQE on the chain), then set
1707	 * "next" to zero.
1708	 */
1709	if (curr_desc != NULL) {
1710		/*
1711		 * The only valid Tavor WQE "nopcode" for MLX transport
1712		 * requests is the "Send" code.
1713		 */
1714		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1715		curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1716		    (uintptr_t)curr_desc - qp->qp_desc_off);
1717		next = (uint64_t)((uintptr_t)curr_desc &
1718		    TAVOR_WQE_NDA_MASK) << 32;
1719		next = next | ((uint64_t)nopcode << 32);
1720		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1721
1722		/*
1723		 * If a send queue doorbell will be rung for the next
1724		 * WQE on the chain, then set the current WQE's "dbd" bit.
1725		 * Note: We also update the "dbinfo" structure here to pass
1726		 * back information about what should (later) be included
1727		 * in the send queue doorbell.
1728		 */
1729		if (dbinfo) {
1730			next = next | TAVOR_WQE_DBD_MASK;
1731			dbinfo->db_nopcode = nopcode;
1732			dbinfo->db_fence   = 0;
1733		}
1734	} else {
1735		next = 0;
1736	}
1737
1738	/*
1739	 * If this WQE is supposed to be linked to the previous descriptor,
1740	 * then we need to update not only the previous WQE's "next" fields
1741	 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1742	 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1743	 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1744	 * always hardcoded to zero.
1745	 */
1746	if (prev_desc != NULL) {
1747		/*
1748		 * If a send queue doorbell will be rung for the next WQE on
1749		 * the chain, then update the current WQE's "next" field and
1750		 * return.
1751		 * Note: We don't want to modify the "ctrl" field here because
1752		 * that portion of the previous WQE has already been set
1753		 * correctly at some previous point in time.
1754		 */
1755		if (dbinfo) {
1756			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1757			return;
1758		}
1759
1760		/*
1761		 * Pull the address handle from the work request and read in
1762		 * the contents of the UDAV.  This will be used to answer some
1763		 * questions about the request.
1764		 */
1765		ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1766		mutex_enter(&ah->ah_lock);
1767		udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1768		for (i = 0; i < udav_sz; i++) {
1769			data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1770			    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1771			((uint64_t *)&udav)[i] = data;
1772		}
1773		mutex_exit(&ah->ah_lock);
1774
1775		ctrl = 0;
1776
1777		/* Only QP0 uses VL15, otherwise use VL in the packet */
1778		if (qp->qp_is_special == TAVOR_QP_SMI) {
1779			ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1780		}
1781
1782		/*
1783		 * The SLR (Source LID Replace) bit determines whether the
1784		 * source LID for an outgoing MLX packet should come from the
1785		 * PortInfo (SLR = 0) or should be left as it is in the
1786		 * descriptor (SLR = 1).  The latter is necessary for packets
1787		 * to be sent with the Permissive LID.
1788		 */
1789		if (udav.rlid == IB_LID_PERMISSIVE) {
1790			ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1791		}
1792
1793		/* Fill in the max static rate from the address handle */
1794		ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1795		    TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1796
1797		/* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1798		if (qp->qp_is_special != TAVOR_QP_SMI) {
1799			ctrl = ctrl | ((uint64_t)udav.sl <<
1800			    TAVOR_WQE_MLXHDR_SL_SHIFT);
1801		}
1802
1803		/* Set the "c" (i.e. "signaled") bit appropriately */
1804		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1805			ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1806		}
1807
1808		/* Fill in the destination LID from the address handle */
1809		ctrl = ctrl | ((uint64_t)udav.rlid <<
1810		    TAVOR_WQE_MLXHDR_RLID_SHIFT);
1811
1812		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1813	}
1814}
1815
1816
1817/*
1818 * tavor_wqe_recv_build()
1819 *    Context: Can be called from interrupt or base context.
1820 */
1821/* ARGSUSED */
1822static int
1823tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1824    ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1825{
1826	tavor_hw_wqe_sgl_t	*ds;
1827	int			i, num_ds;
1828
1829	TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1830
1831	ASSERT(MUTEX_HELD(&qp->qp_lock));
1832
1833	/* Check that work request transport type is valid */
1834	if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1835	    (qp->qp_serv_type != TAVOR_QP_RC) &&
1836	    (qp->qp_serv_type != TAVOR_QP_UC)) {
1837		TNF_PROBE_0(tavor_build_recv_wqe_inv_servtype_fail,
1838		    TAVOR_TNF_ERROR, "");
1839		TAVOR_TNF_EXIT(tavor_build_recv_wqe);
1840		return (IBT_QP_SRV_TYPE_INVALID);
1841	}
1842
1843	/* Fill in the Data Segments (SGL) for the Recv WQE */
1844	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1845	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1846	num_ds = 0;
1847
1848	/* Check for valid number of SGL entries */
1849	if (wr->wr_nds > qp->qp_rq_sgl) {
1850		TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail,
1851		    TAVOR_TNF_ERROR, "");
1852		TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1853		return (IBT_QP_SGL_LEN_INVALID);
1854	}
1855
1856	/*
1857	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1858	 * segments.  Note: We skip any SGL with zero size because Tavor
1859	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1860	 * the encoding for zero means a 2GB transfer.  Because of this special
1861	 * encoding in the hardware, we mask the requested length with
1862	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1863	 * zero.)
1864	 */
1865	for (i = 0; i < wr->wr_nds; i++) {
1866		if (wr->wr_sgl[i].ds_len == 0) {
1867			continue;
1868		}
1869
1870		/*
1871		 * Fill in the Data Segment(s) for the receive WQE, using the
1872		 * information contained in the scatter-gather list of the
1873		 * work request.
1874		 */
1875		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1876		num_ds++;
1877	}
1878
1879	/* Return the size of descriptor (in 16-byte chunks) */
1880	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1881
1882	TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1883	return (DDI_SUCCESS);
1884}
1885
1886
1887/*
1888 * tavor_wqe_recv_linknext()
1889 *    Context: Can be called from interrupt or base context.
1890 */
1891static void
1892tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1893    uint64_t *prev_desc, tavor_qphdl_t qp)
1894{
1895	uint64_t	next;
1896
1897	/*
1898	 * Calculate the "next" field of the descriptor.  This amounts to
1899	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1900	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1901	 * if the current descriptor is the last WQE on the chain), then set
1902	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1903	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1904	 * In either case, we must add a single bit in the "reserved" field
1905	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1906	 * workaround for a known Tavor errata that can cause Recv WQEs with
1907	 * zero in the NDA field to behave improperly.
1908	 */
1909	if (curr_desc != NULL) {
1910		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1911		    qp->qp_desc_off);
1912		next = (uint64_t)((uintptr_t)curr_desc &
1913		    TAVOR_WQE_NDA_MASK) << 32;
1914		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1915		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1916	} else {
1917		next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1918	}
1919
1920	/*
1921	 * If this WQE is supposed to be linked to the previous descriptor,
1922	 * then we need to update not only the previous WQE's "next" fields
1923	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1924	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1925	 * bits are always hardcoded to zero.
1926	 */
1927	if (prev_desc != NULL) {
1928		TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1929	}
1930}
1931
1932
1933/*
1934 * tavor_wqe_srq_build()
1935 *    Context: Can be called from interrupt or base context.
1936 */
1937/* ARGSUSED */
1938static int
1939tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1940    ibt_recv_wr_t *wr, uint64_t *desc)
1941{
1942	tavor_hw_wqe_sgl_t	*ds;
1943	ibt_wr_ds_t		end_sgl;
1944	int			i, num_ds;
1945
1946	TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1947
1948	ASSERT(MUTEX_HELD(&srq->srq_lock));
1949
1950	/* Fill in the Data Segments (SGL) for the Recv WQE */
1951	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1952	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1953	num_ds = 0;
1954
1955	/* Check for valid number of SGL entries */
1956	if (wr->wr_nds > srq->srq_wq_sgl) {
1957		TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail,
1958		    TAVOR_TNF_ERROR, "");
1959		TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1960		return (IBT_QP_SGL_LEN_INVALID);
1961	}
1962
1963	/*
1964	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1965	 * segments.  Note: We skip any SGL with zero size because Tavor
1966	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1967	 * the encoding for zero means a 2GB transfer.  Because of this special
1968	 * encoding in the hardware, we mask the requested length with
1969	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1970	 * zero.)
1971	 */
1972	for (i = 0; i < wr->wr_nds; i++) {
1973		if (wr->wr_sgl[i].ds_len == 0) {
1974			continue;
1975		}
1976
1977		/*
1978		 * Fill in the Data Segment(s) for the receive WQE, using the
1979		 * information contained in the scatter-gather list of the
1980		 * work request.
1981		 */
1982		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1983		num_ds++;
1984	}
1985
1986	/*
1987	 * For SRQ, if the number of data segments is less than the maximum
1988	 * specified at alloc, then we have to fill in a special "key" entry in
1989	 * the sgl entry after the last valid one in this post request.  We do
1990	 * that here.
1991	 */
1992	if (num_ds < srq->srq_wq_sgl) {
1993		end_sgl.ds_va  = 0;
1994		end_sgl.ds_len = 0;
1995		end_sgl.ds_key = 0x1;
1996		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1997	}
1998
1999	TAVOR_TNF_EXIT(tavor_wqe_srq_build);
2000	return (DDI_SUCCESS);
2001}
2002
2003
2004/*
2005 * tavor_wqe_srq_linknext()
2006 *    Context: Can be called from interrupt or base context.
2007 */
2008static void
2009tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
2010    tavor_srqhdl_t srq)
2011{
2012	uint64_t	next;
2013
2014	/*
2015	 * Calculate the "next" field of the descriptor.  This amounts to
2016	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
2017	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
2018	 * if the current descriptor is the last WQE on the chain), then set
2019	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
2020	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
2021	 * In either case, we must add a single bit in the "reserved" field
2022	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
2023	 * workaround for a known Tavor errata that can cause Recv WQEs with
2024	 * zero in the NDA field to behave improperly.
2025	 */
2026	if (curr_desc != NULL) {
2027		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
2028		    srq->srq_desc_off);
2029		next = (uint64_t)((uintptr_t)curr_desc &
2030		    TAVOR_WQE_NDA_MASK) << 32;
2031		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
2032	} else {
2033		next = TAVOR_RCV_WQE_NDA0_WA_MASK;
2034	}
2035
2036	/*
2037	 * If this WQE is supposed to be linked to the previous descriptor,
2038	 * then we need to update not only the previous WQE's "next" fields
2039	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
2040	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
2041	 * bits are always hardcoded to zero.
2042	 */
2043	if (prev_desc != NULL) {
2044		TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
2045	}
2046}
2047
2048
2049/*
2050 * tavor_wr_get_immediate()
2051 *    Context: Can be called from interrupt or base context.
2052 */
2053static uint32_t
2054tavor_wr_get_immediate(ibt_send_wr_t *wr)
2055{
2056	/*
2057	 * This routine extracts the "immediate data" from the appropriate
2058	 * location in the IBTF work request.  Because of the way the
2059	 * work request structure is defined, the location for this data
2060	 * depends on the actual work request operation type.
2061	 */
2062
2063	/* For RDMA Write, test if RC or UC */
2064	if (wr->wr_opcode == IBT_WRC_RDMAW) {
2065		if (wr->wr_trans == IBT_RC_SRV) {
2066			return (wr->wr.rc.rcwr.rdma.rdma_immed);
2067		} else {  /* IBT_UC_SRV */
2068			return (wr->wr.uc.ucwr.rdma.rdma_immed);
2069		}
2070	}
2071
2072	/* For Send, test if RC, UD, or UC */
2073	if (wr->wr_opcode == IBT_WRC_SEND) {
2074		if (wr->wr_trans == IBT_RC_SRV) {
2075			return (wr->wr.rc.rcwr.send_immed);
2076		} else if (wr->wr_trans == IBT_UD_SRV) {
2077			return (wr->wr.ud.udwr_immed);
2078		} else {  /* IBT_UC_SRV */
2079			return (wr->wr.uc.ucwr.send_immed);
2080		}
2081	}
2082
2083	/*
2084	 * If any other type of request, then immediate is undefined
2085	 */
2086	return (0);
2087}
2088
2089
2090/*
2091 * tavor_wqe_sync()
2092 *    Context: Can be called from interrupt or base context.
2093 */
2094static void
2095tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
2096    uint_t sync_type, uint_t flag)
2097{
2098	tavor_qphdl_t		qp;
2099	tavor_srqhdl_t		srq;
2100	uint_t			is_sync_req;
2101	uint64_t		*wqe_from, *wqe_to, *wqe_base, *wqe_top;
2102	ddi_dma_handle_t	dmahdl;
2103	off_t			offset;
2104	size_t			length;
2105	uint32_t		qsize;
2106	int			status;
2107
2108	TAVOR_TNF_ENTER(tavor_wqe_sync);
2109
2110	if (sync_type == TAVOR_WR_SRQ) {
2111		srq = (tavor_srqhdl_t)hdl;
2112		is_sync_req = srq->srq_sync;
2113		/* Get the DMA handle from SRQ context */
2114		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
2115	} else {
2116		qp = (tavor_qphdl_t)hdl;
2117		is_sync_req = qp->qp_sync;
2118		/* Get the DMA handle from QP context */
2119		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
2120	}
2121
2122	/* Determine if the work queues need to be synced or not */
2123	if (is_sync_req == 0) {
2124		TAVOR_TNF_EXIT(tavor_wqe_sync);
2125		return;
2126	}
2127
2128	/*
2129	 * Depending on the type of the work queue, we grab information
2130	 * about the address ranges we need to DMA sync.
2131	 */
2132	if (sync_type == TAVOR_WR_SEND) {
2133		wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
2134		wqe_to   = TAVOR_QP_SQ_ENTRY(qp, sync_to);
2135		qsize	 = qp->qp_sq_bufsz;
2136
2137		wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2138		wqe_top	 = TAVOR_QP_SQ_ENTRY(qp, qsize);
2139	} else if (sync_type == TAVOR_WR_RECV) {
2140		wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2141		wqe_to   = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2142		qsize	 = qp->qp_rq_bufsz;
2143
2144		wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2145		wqe_top	 = TAVOR_QP_RQ_ENTRY(qp, qsize);
2146	} else {
2147		wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2148		wqe_to   = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2149		qsize	 = srq->srq_wq_bufsz;
2150
2151		wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2152		wqe_top	 = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2153	}
2154
2155	/*
2156	 * There are two possible cases for the beginning and end of the WQE
2157	 * chain we are trying to sync.  Either this is the simple case, where
2158	 * the end of the chain is below the beginning of the chain, or it is
2159	 * the "wrap-around" case, where the end of the chain has wrapped over
2160	 * the end of the queue.  In the former case, we simply need to
2161	 * calculate the span from beginning to end and sync it.  In the latter
2162	 * case, however, we need to calculate the span from the top of the
2163	 * work queue to the end of the chain and sync that, and then we need
2164	 * to find the other portion (from beginning of chain to end of queue)
2165	 * and sync that as well.  Note: if the "top to end" span is actually
2166	 * zero length, then we don't do a DMA sync because a zero length DMA
2167	 * sync unnecessarily syncs the entire work queue.
2168	 */
2169	if (wqe_to > wqe_from) {
2170		/* "From Beginning to End" */
2171		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2172		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2173
2174		status = ddi_dma_sync(dmahdl, offset, length, flag);
2175		if (status != DDI_SUCCESS) {
2176			TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2177			TAVOR_TNF_EXIT(tavor_wqe_sync);
2178			return;
2179		}
2180	} else {
2181		/* "From Top to End" */
2182		offset = (off_t)0;
2183		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2184		if (length) {
2185			status = ddi_dma_sync(dmahdl, offset, length, flag);
2186			if (status != DDI_SUCCESS) {
2187				TNF_PROBE_0(tavor_wqe_sync_fail,
2188				    TAVOR_TNF_ERROR, "");
2189				TAVOR_TNF_EXIT(tavor_wqe_sync);
2190				return;
2191			}
2192		}
2193
2194		/* "From Beginning to Bottom" */
2195		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2196		length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2197		status = ddi_dma_sync(dmahdl, offset, length, flag);
2198		if (status != DDI_SUCCESS) {
2199			TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2200			TAVOR_TNF_EXIT(tavor_wqe_sync);
2201			return;
2202		}
2203	}
2204
2205	TAVOR_TNF_EXIT(tavor_wqe_sync);
2206}
2207
2208
2209/*
2210 * tavor_wr_bind_check()
2211 *    Context: Can be called from interrupt or base context.
2212 */
2213static int
2214tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2215{
2216	ibt_bind_flags_t	bind_flags;
2217	uint64_t		vaddr, len;
2218	uint64_t		reg_start_addr, reg_end_addr;
2219	tavor_mwhdl_t		mw;
2220	tavor_mrhdl_t		mr;
2221	tavor_rsrc_t		*mpt;
2222	uint32_t		new_rkey;
2223
2224	TAVOR_TNF_ENTER(tavor_wr_bind_check);
2225
2226	/* Check for a valid Memory Window handle in the WR */
2227	mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2228	if (mw == NULL) {
2229		TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail,
2230		    TAVOR_TNF_ERROR, "");
2231		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2232		return (IBT_MW_HDL_INVALID);
2233	}
2234
2235	/* Check for a valid Memory Region handle in the WR */
2236	mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2237	if (mr == NULL) {
2238		TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail,
2239		    TAVOR_TNF_ERROR, "");
2240		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2241		return (IBT_MR_HDL_INVALID);
2242	}
2243
2244	mutex_enter(&mr->mr_lock);
2245	mutex_enter(&mw->mr_lock);
2246
2247	/*
2248	 * Check here to see if the memory region has already been partially
2249	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2250	 * If so, this is an error, return failure.
2251	 */
2252	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2253		mutex_exit(&mr->mr_lock);
2254		mutex_exit(&mw->mr_lock);
2255		TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail,
2256		    TAVOR_TNF_ERROR, "");
2257		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2258		return (IBT_MR_HDL_INVALID);
2259	}
2260
2261	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2262	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2263		mutex_exit(&mr->mr_lock);
2264		mutex_exit(&mw->mr_lock);
2265		TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail,
2266		    TAVOR_TNF_ERROR, "");
2267		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2268		return (IBT_MR_RKEY_INVALID);
2269	}
2270
2271	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2272	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2273		mutex_exit(&mr->mr_lock);
2274		mutex_exit(&mw->mr_lock);
2275		TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail,
2276		    TAVOR_TNF_ERROR, "");
2277		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2278		return (IBT_MR_LKEY_INVALID);
2279	}
2280
2281	/*
2282	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2283	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2284	 */
2285	len = wr->wr.rc.rcwr.bind->bind_len;
2286	if (len != 0) {
2287		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2288		reg_start_addr = mr->mr_bindinfo.bi_addr;
2289		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2290		    (mr->mr_bindinfo.bi_len - 1);
2291		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2292			mutex_exit(&mr->mr_lock);
2293			mutex_exit(&mw->mr_lock);
2294			TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail,
2295			    TAVOR_TNF_ERROR, "");
2296			TAVOR_TNF_EXIT(tavor_wr_bind_check);
2297			return (IBT_MR_VA_INVALID);
2298		}
2299		vaddr = (vaddr + len) - 1;
2300		if (vaddr > reg_end_addr) {
2301			mutex_exit(&mr->mr_lock);
2302			mutex_exit(&mw->mr_lock);
2303			TNF_PROBE_0(tavor_wr_bind_check_invlen_fail,
2304			    TAVOR_TNF_ERROR, "");
2305			TAVOR_TNF_EXIT(tavor_wr_bind_check);
2306			return (IBT_MR_LEN_INVALID);
2307		}
2308	}
2309
2310	/*
2311	 * Validate the bind access flags.  Remote Write and Atomic access for
2312	 * the Memory Window require that Local Write access be set in the
2313	 * corresponding Memory Region.
2314	 */
2315	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2316	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2317	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2318	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2319		mutex_exit(&mr->mr_lock);
2320		mutex_exit(&mw->mr_lock);
2321		TNF_PROBE_0(tavor_wr_bind_check_invflags_fail,
2322		    TAVOR_TNF_ERROR, "");
2323		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2324		return (IBT_MR_ACCESS_REQ_INVALID);
2325	}
2326
2327	/* Calculate the new RKey for the Memory Window */
2328	mpt = mw->mr_mptrsrcp;
2329	tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2330
2331	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2332	mw->mr_rkey = new_rkey;
2333
2334	mutex_exit(&mr->mr_lock);
2335	mutex_exit(&mw->mr_lock);
2336	TAVOR_TNF_EXIT(tavor_wr_bind_check);
2337	return (DDI_SUCCESS);
2338}
2339
2340
2341/*
2342 * tavor_wrid_from_reset_handling()
2343 *    Context: Can be called from interrupt or base context.
2344 */
2345int
2346tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2347{
2348	tavor_workq_hdr_t	*swq, *rwq;
2349	tavor_wrid_list_hdr_t	*s_wridlist, *r_wridlist;
2350	uint_t			create_new_swq = 0, create_new_rwq = 0;
2351	uint_t			create_wql = 0;
2352	uint_t			qp_srq_en;
2353
2354	TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling);
2355
2356	/*
2357	 * For each of this QP's Work Queues, make sure we have a (properly
2358	 * initialized) Work Request ID list attached to the relevant
2359	 * completion queue.  Grab the CQ lock(s) before manipulating the
2360	 * lists.
2361	 */
2362	tavor_wrid_wqhdr_lock_both(qp);
2363	swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2364	    TAVOR_WR_SEND);
2365	if (swq == NULL) {
2366		/* Couldn't find matching work queue header, create it */
2367		create_new_swq = create_wql = 1;
2368		swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2369		    qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2370		if (swq == NULL) {
2371			/*
2372			 * If we couldn't find/allocate space for the workq
2373			 * header, then drop the lock(s) and return failure.
2374			 */
2375			tavor_wrid_wqhdr_unlock_both(qp);
2376			TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2377			    TAVOR_TNF_ERROR, "");
2378			TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2379			return (ibc_get_ci_failure(0));
2380		}
2381	}
2382	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2383	qp->qp_sq_wqhdr = swq;
2384	swq->wq_size = qp->qp_sq_bufsz;
2385	swq->wq_head = 0;
2386	swq->wq_tail = 0;
2387	swq->wq_full = 0;
2388
2389	/*
2390	 * Allocate space for the tavor_wrid_entry_t container
2391	 */
2392	s_wridlist = tavor_wrid_get_list(swq->wq_size);
2393	if (s_wridlist == NULL) {
2394		/*
2395		 * If we couldn't allocate space for tracking the WRID
2396		 * entries, then cleanup the workq header from above (if
2397		 * necessary, i.e. if we created the workq header).  Then
2398		 * drop the lock(s) and return failure.
2399		 */
2400		if (create_new_swq) {
2401			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2402		}
2403
2404		tavor_wrid_wqhdr_unlock_both(qp);
2405		TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2406		    TAVOR_TNF_ERROR, "");
2407		TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2408		return (ibc_get_ci_failure(0));
2409	}
2410	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2411	s_wridlist->wl_wqhdr = swq;
2412
2413	/* Chain the new WRID list container to the workq hdr list */
2414	mutex_enter(&swq->wq_wrid_wql->wql_lock);
2415	tavor_wrid_wqhdr_add(swq, s_wridlist);
2416	mutex_exit(&swq->wq_wrid_wql->wql_lock);
2417
2418	qp_srq_en = qp->qp_srq_en;
2419
2420#ifdef __lock_lint
2421	mutex_enter(&qp->qp_srqhdl->srq_lock);
2422#else
2423	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2424		mutex_enter(&qp->qp_srqhdl->srq_lock);
2425	}
2426#endif
2427	/*
2428	 * Now we repeat all the above operations for the receive work queue,
2429	 * or shared receive work queue.
2430	 *
2431	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2432	 */
2433	rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2434	    TAVOR_WR_RECV);
2435	if (rwq == NULL) {
2436		create_new_rwq = create_wql = 1;
2437
2438		/*
2439		 * If this QP is associated with an SRQ, and this isn't the
2440		 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2441		 * created.  Since the WQL is created at 'wqhdr_create' time we
2442		 * pass in the flag 'create_wql' here to be 0 if we have
2443		 * already created it.  And later on below we then next setup
2444		 * the WQL and rwq information based off the existing SRQ info.
2445		 */
2446		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2447		    qp->qp_srqhdl->srq_wrid_wql != NULL) {
2448			create_wql = 0;
2449		}
2450
2451		rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2452		    qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2453		if (rwq == NULL) {
2454			/*
2455			 * If we couldn't find/allocate space for the workq
2456			 * header, then free all the send queue resources we
2457			 * just allocated and setup (above), drop the lock(s)
2458			 * and return failure.
2459			 */
2460			mutex_enter(&swq->wq_wrid_wql->wql_lock);
2461			tavor_wrid_wqhdr_remove(swq, s_wridlist);
2462			mutex_exit(&swq->wq_wrid_wql->wql_lock);
2463			if (create_new_swq) {
2464				tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2465				    swq);
2466			}
2467
2468#ifdef __lock_lint
2469			mutex_exit(&qp->qp_srqhdl->srq_lock);
2470#else
2471			if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2472				mutex_exit(&qp->qp_srqhdl->srq_lock);
2473			}
2474#endif
2475
2476			tavor_wrid_wqhdr_unlock_both(qp);
2477			TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2478			    TAVOR_TNF_ERROR, "");
2479			TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2480			return (ibc_get_ci_failure(0));
2481		}
2482	}
2483	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2484
2485	/*
2486	 * Setup receive workq hdr
2487	 *
2488	 * If the QP is on an SRQ, we setup the SRQ specific fields, setting
2489	 * keeping a copy of the rwq pointer, setting the rwq bufsize
2490	 * appropriately, and initializing our part of the WQLock.
2491	 *
2492	 * In the normal QP case, the QP recv queue bufsize is used.
2493	 */
2494	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2495		rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2496		if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2497			qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2498		} else {
2499			rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2500		}
2501		tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2502
2503	} else {
2504		rwq->wq_size = qp->qp_rq_bufsz;
2505	}
2506
2507	qp->qp_rq_wqhdr = rwq;
2508	rwq->wq_head = 0;
2509	rwq->wq_tail = 0;
2510	rwq->wq_full = 0;
2511
2512	/*
2513	 * Allocate space for the tavor_wrid_entry_t container.
2514	 *
2515	 * If QP is on an SRQ, and the wrq_wridlist is NULL then we must
2516	 * allocate the wridlist normally.  However, if the srq_wridlist is !=
2517	 * NULL, then we know this SRQ has already been initialized, thus the
2518	 * wridlist has already been initialized.  So we re-use the
2519	 * srq_wridlist as the r_wridlist for this QP in this case.
2520	 */
2521	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2522	    qp->qp_srqhdl->srq_wridlist != NULL) {
2523		/* Use existing srq_wridlist pointer */
2524		r_wridlist = qp->qp_srqhdl->srq_wridlist;
2525		ASSERT(r_wridlist != NULL);
2526	} else {
2527		/* Allocate memory for the r_wridlist */
2528		r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2529	}
2530
2531	/*
2532	 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2533	 * is mistakenly NULL), we cleanup our previous swq allocation from
2534	 * above
2535	 */
2536	if (r_wridlist == NULL) {
2537		/*
2538		 * If we couldn't allocate space for tracking the WRID
2539		 * entries, then cleanup all the stuff from above.  Then
2540		 * drop the lock(s) and return failure.
2541		 */
2542		mutex_enter(&swq->wq_wrid_wql->wql_lock);
2543		tavor_wrid_wqhdr_remove(swq, s_wridlist);
2544		mutex_exit(&swq->wq_wrid_wql->wql_lock);
2545		if (create_new_swq) {
2546			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2547		}
2548		if (create_new_rwq) {
2549			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2550		}
2551
2552#ifdef __lock_lint
2553		mutex_exit(&qp->qp_srqhdl->srq_lock);
2554#else
2555		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2556			mutex_exit(&qp->qp_srqhdl->srq_lock);
2557		}
2558#endif
2559
2560		tavor_wrid_wqhdr_unlock_both(qp);
2561		TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2562		    TAVOR_TNF_ERROR, "");
2563		TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2564		return (ibc_get_ci_failure(0));
2565	}
2566	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2567
2568	/*
2569	 * Initialize the wridlist
2570	 *
2571	 * In the normal QP case, there is no special initialization needed.
2572	 * We simply setup the wridlist backpointer to be the receive wqhdr
2573	 * (rwq).
2574	 *
2575	 * But in the SRQ case, there is no backpointer to the wqhdr possible.
2576	 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2577	 * and thus potentially shared across multiple QPs with the SRQ.  We
2578	 * also setup the srq_wridlist pointer to be the r_wridlist, and
2579	 * intialize the freelist to an invalid index.  This srq_wridlist
2580	 * pointer is used above on future moves from_reset to let us know that
2581	 * the srq_wridlist has been initialized already.
2582	 *
2583	 * And finally, if we are in a non-UMAP case, we setup the srq wrid
2584	 * free list.
2585	 */
2586	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2587	    qp->qp_srqhdl->srq_wridlist == NULL) {
2588		r_wridlist->wl_srq_en = 1;
2589		r_wridlist->wl_free_list_indx = -1;
2590		qp->qp_srqhdl->srq_wridlist = r_wridlist;
2591
2592		/* Initialize srq wrid free list */
2593		if (qp->qp_srqhdl->srq_is_umap == 0) {
2594			mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2595			tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2596			mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2597		}
2598	} else {
2599		r_wridlist->wl_wqhdr = rwq;
2600	}
2601
2602	/* Chain the WRID list "container" to the workq hdr list */
2603	mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2604	tavor_wrid_wqhdr_add(rwq, r_wridlist);
2605	mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2606
2607#ifdef __lock_lint
2608	mutex_exit(&qp->qp_srqhdl->srq_lock);
2609#else
2610	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2611		mutex_exit(&qp->qp_srqhdl->srq_lock);
2612	}
2613#endif
2614
2615	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2616	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2617	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2618	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2619
2620	tavor_wrid_wqhdr_unlock_both(qp);
2621	TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2622	return (DDI_SUCCESS);
2623}
2624
2625
2626/*
2627 * tavor_wrid_to_reset_handling()
2628 *    Context: Can be called from interrupt or base context.
2629 */
2630void
2631tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2632{
2633	uint_t		free_wqhdr = 0;
2634
2635	TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling);
2636
2637	/*
2638	 * For each of this QP's Work Queues, move the WRID "container" to
2639	 * the "reapable" list.  Although there may still be unpolled
2640	 * entries in these containers, it is not a big deal.  We will not
2641	 * reap the list until either the Poll CQ command detects an empty
2642	 * condition or the CQ itself is freed.  Grab the CQ lock(s) before
2643	 * manipulating the lists.
2644	 */
2645	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2646	tavor_wrid_wqhdr_lock_both(qp);
2647	tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2648
2649	/*
2650	 * Add the receive work queue header on to the reaplist.  But if we are
2651	 * on SRQ, then don't add anything to the reaplist.  Instead we flush
2652	 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2653	 * WQHDR (if needed).  We must hold the WQL for these operations, yet
2654	 * the call to tavor_cq_wqhdr_remove grabs the WQL internally.  So we
2655	 * drop WQL before that call.  Then release the CQ WQHDR locks and the
2656	 * CQ lock and return.
2657	 */
2658	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2659
2660		/*
2661		 * Pull off all (if any) entries for this QP from CQ.  This
2662		 * only includes entries that have not yet been polled
2663		 */
2664		mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2665		tavor_cq_srq_entries_flush(state, qp);
2666
2667		/* Remove wridlist from WQHDR */
2668		tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2669		    qp->qp_rq_wqhdr->wq_wrid_post);
2670
2671		/* If wridlist chain is now empty, remove the wqhdr as well */
2672		if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2673			free_wqhdr = 1;
2674		} else {
2675			free_wqhdr = 0;
2676		}
2677
2678		mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2679
2680		/* Free the WQHDR */
2681		if (free_wqhdr) {
2682			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2683		}
2684	} else {
2685		tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2686	}
2687	tavor_wrid_wqhdr_unlock_both(qp);
2688	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2689
2690	TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling);
2691}
2692
2693
2694/*
2695 * tavor_wrid_add_entry()
2696 *    Context: Can be called from interrupt or base context.
2697 */
2698void
2699tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2700    uint_t signaled_dbd)
2701{
2702	tavor_wrid_entry_t	*wre_tmp;
2703	uint32_t		head, tail, size;
2704
2705	TAVOR_TNF_ENTER(tavor_wrid_add_entry);
2706
2707	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2708
2709	/*
2710	 * Find the entry in the container pointed to by the "tail" index.
2711	 * Add all of the relevant information to that entry, including WRID,
2712	 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2713	 * and/or doorbelled.
2714	 */
2715	head = wq->wq_wrid_post->wl_head;
2716	tail = wq->wq_wrid_post->wl_tail;
2717	size = wq->wq_wrid_post->wl_size;
2718	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2719	wre_tmp->wr_wrid	  = wrid;
2720	wre_tmp->wr_wqeaddrsz	  = wqeaddrsz;
2721	wre_tmp->wr_signaled_dbd  = signaled_dbd;
2722
2723	/*
2724	 * Update the "wrid_old_tail" pointer to point to the entry we just
2725	 * inserted into the queue.  By tracking this pointer (the pointer to
2726	 * the most recently inserted entry) it will possible later in the
2727	 * PostSend() and PostRecv() code paths to find the entry that needs
2728	 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2729	 * tavor_post_send()).
2730	 */
2731	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2732
2733	/* Update the tail index */
2734	tail = ((tail + 1) & (size - 1));
2735	wq->wq_wrid_post->wl_tail = tail;
2736
2737	/*
2738	 * If the "tail" index has just wrapped over into the "head" index,
2739	 * then we have filled the container.  We use the "full" flag to
2740	 * indicate this condition and to distinguish it from the "empty"
2741	 * condition (where head and tail are also equal).
2742	 */
2743	if (head == tail) {
2744		wq->wq_wrid_post->wl_full = 1;
2745	}
2746	TAVOR_TNF_EXIT(tavor_wrid_add_entry);
2747}
2748
2749/*
2750 * tavor_wrid_add_entry_srq()
2751 * Context: Can be called from interrupt or base context
2752 */
2753void
2754tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2755{
2756	tavor_wrid_entry_t	*wre;
2757	uint64_t		*wl_wqe;
2758	uint32_t		wqe_index;
2759
2760	TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq);
2761
2762	/*
2763	 * Find the next available WQE from the SRQ free_list.  Then update the
2764	 * free_list to point to the next entry
2765	 */
2766	wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2767
2768	wqe_index = srq->srq_wridlist->wl_free_list_indx;
2769
2770	/* ASSERT on impossible wqe_index values */
2771	ASSERT(wqe_index < srq->srq_wq_bufsz);
2772
2773	/*
2774	 * Setup the WRE.
2775	 *
2776	 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2777	 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2778	 * this information and associate the WRID to the WQE found on the CQE.
2779	 */
2780	wre = &srq->srq_wridlist->wl_wre[wqe_index];
2781	wre->wr_wrid = wrid;
2782	wre->wr_signaled_dbd  = signaled_dbd;
2783
2784	/* Update the free list index */
2785	srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2786	    srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2787
2788	TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq);
2789}
2790
2791
2792/*
2793 * tavor_wrid_get_entry()
2794 *    Context: Can be called from interrupt or base context.
2795 */
2796uint64_t
2797tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2798    tavor_wrid_entry_t *wre)
2799{
2800	tavor_workq_hdr_t	*wq;
2801	tavor_wrid_entry_t	*wre_tmp;
2802	uint64_t		wrid;
2803	uint_t			send_or_recv, qpnum, error, opcode;
2804
2805	TAVOR_TNF_ENTER(tavor_wrid_get_entry);
2806
2807	/* Lock the list of work queues associated with this CQ */
2808	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2809
2810	/*
2811	 * Determine whether this CQE is a send or receive completion (and
2812	 * whether it was a "successful" completion or not)
2813	 */
2814	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2815	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2816	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2817		error = 1;
2818		send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2819		    TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2820	} else {
2821		error = 0;
2822		send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2823	}
2824
2825	/* Find the work queue for this QP number (send or receive side) */
2826	qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2827	wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2828	ASSERT(wq != NULL);
2829
2830	/*
2831	 * Regardless of whether the completion is the result of a "success"
2832	 * or a "failure", we lock the list of "containers" and attempt to
2833	 * search for the the first matching completion (i.e. the first WR
2834	 * with a matching WQE addr and size).  Once we find it, we pull out
2835	 * the "wrid" field and return it (see below).  Note: One possible
2836	 * future enhancement would be to enable this routine to skip over
2837	 * any "unsignaled" completions to go directly to the next "signaled"
2838	 * entry on success. XXX
2839	 */
2840	mutex_enter(&wq->wq_wrid_wql->wql_lock);
2841	wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2842
2843	/*
2844	 * If this is a "successful" completion, then we assert that this
2845	 * completion must be a "signaled" completion.
2846	 */
2847	ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2848
2849	/*
2850	 * If the completion is a "failed" completion, then we save away the
2851	 * contents of the entry (into the "wre" field passed in) for use
2852	 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2853	 * function to grab "wqeaddrsz" from the next entry in the container.
2854	 * This is required for error processing (where updating these fields
2855	 * properly is necessary to correct handling of the "error" CQE)
2856	 */
2857	if (error && (wre != NULL)) {
2858		*wre = *wre_tmp;
2859		wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2860	}
2861
2862	/* Pull out the WRID and return it */
2863	wrid = wre_tmp->wr_wrid;
2864
2865	mutex_exit(&wq->wq_wrid_wql->wql_lock);
2866	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2867
2868	TAVOR_TNF_EXIT(tavor_wrid_get_entry);
2869	return (wrid);
2870}
2871
2872
2873/*
2874 * tavor_wrid_find_match()
2875 *    Context: Can be called from interrupt or base context.
2876 */
2877static tavor_wrid_entry_t *
2878tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2879    tavor_hw_cqe_t *cqe)
2880{
2881	tavor_wrid_entry_t	*curr = NULL;
2882	tavor_wrid_list_hdr_t	*container;
2883	uint32_t		wqeaddr_size;
2884	uint32_t		head, tail, size;
2885	int			found = 0, last_container;
2886
2887	TAVOR_TNF_ENTER(tavor_wrid_find_match);
2888
2889	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2890
2891	/* Pull the "wqeaddrsz" information from the CQE */
2892	wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2893
2894	/*
2895	 * Walk the "containers" list(s), find first WR with a matching WQE
2896	 * addr.  If the current "container" is not the last one on the list,
2897	 * i.e. not the current one to which we are posting new WRID entries,
2898	 * then we do not attempt to update the "q_head", "q_tail", and
2899	 * "q_full" indicators on the main work queue header.  We do, however,
2900	 * update the "head" and "full" indicators on the individual containers
2901	 * as we go.  This is imperative because we need to be able to
2902	 * determine when the current container has been emptied (so that we
2903	 * can move on to the next container).
2904	 */
2905	container = wq->wq_wrid_poll;
2906	while (container != NULL) {
2907		/* Is this the last/only "container" on the list */
2908		last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2909
2910		/*
2911		 * First check if we are on an SRQ.  If so, we grab the entry
2912		 * and break out.  Since SRQ wridlist's are never added to
2913		 * reaplist, they can only be the last container.
2914		 */
2915		if (container->wl_srq_en) {
2916			ASSERT(last_container == 1);
2917			curr = tavor_wrid_find_match_srq(container, cq, cqe);
2918			break;
2919		}
2920
2921		/*
2922		 * Grab the current "head", "tail" and "size" fields before
2923		 * walking the list in the current container. Note: the "size"
2924		 * field here must always be a power-of-2.  The "full"
2925		 * parameter is checked (and updated) here to distinguish the
2926		 * "queue full" condition from "queue empty".
2927		 */
2928		head = container->wl_head;
2929		tail = container->wl_tail;
2930		size = container->wl_size;
2931		while ((head != tail) || (container->wl_full)) {
2932			container->wl_full = 0;
2933			curr = &container->wl_wre[head];
2934			head = ((head + 1) & (size - 1));
2935
2936			/*
2937			 * If the current entry's "wqeaddrsz" matches the one
2938			 * we're searching for, then this must correspond to
2939			 * the work request that caused the completion.  Set
2940			 * the "found" flag and bail out.
2941			 */
2942			if (curr->wr_wqeaddrsz == wqeaddr_size) {
2943				found = 1;
2944				break;
2945			}
2946		}
2947
2948		/*
2949		 * If the current container is empty (having reached here the
2950		 * "head == tail" condition can only mean that the container
2951		 * is empty), then NULL out the "wrid_old_tail" field (see
2952		 * tavor_post_send() and tavor_post_recv() for more details)
2953		 * and (potentially) remove the current container from future
2954		 * searches.
2955		 */
2956		if (head == tail) {
2957
2958			container->wl_wre_old_tail = NULL;
2959			/*
2960			 * If this wasn't the last "container" on the chain,
2961			 * i.e. the one to which new WRID entries will be
2962			 * added, then remove it from the list.
2963			 * Note: we don't "lose" the memory pointed to by this
2964			 * because we should have already put this container
2965			 * on the "reapable" list (from where it will later be
2966			 * pulled).
2967			 */
2968			if (!last_container) {
2969				wq->wq_wrid_poll = container->wl_next;
2970			}
2971		}
2972
2973		/* Update the head index for the container */
2974		container->wl_head = head;
2975
2976		/*
2977		 * If the entry was found in this container, then continue to
2978		 * bail out.  Else reset the "curr" pointer and move on to the
2979		 * next container (if there is one).  Note: the only real
2980		 * reason for setting "curr = NULL" here is so that the ASSERT
2981		 * below can catch the case where no matching entry was found
2982		 * on any of the lists.
2983		 */
2984		if (found) {
2985			break;
2986		} else {
2987			curr = NULL;
2988			container = container->wl_next;
2989		}
2990	}
2991
2992	/*
2993	 * Update work queue header's "head" and "full" conditions to match
2994	 * the last entry on the container list.  (Note: Only if we're pulling
2995	 * entries from the last work queue portion of the list, i.e. not from
2996	 * the previous portions that may be the "reapable" list.)
2997	 */
2998	if (last_container) {
2999		wq->wq_head = wq->wq_wrid_post->wl_head;
3000		wq->wq_full = wq->wq_wrid_post->wl_full;
3001	}
3002
3003	/* Ensure that we've actually found what we were searching for */
3004	ASSERT(curr != NULL);
3005
3006	TAVOR_TNF_EXIT(tavor_wrid_find_match);
3007	return (curr);
3008}
3009
3010
3011/*
3012 * tavor_wrid_find_match_srq()
3013 *    Context: Can be called from interrupt or base context.
3014 */
3015tavor_wrid_entry_t *
3016tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
3017    tavor_hw_cqe_t *cqe)
3018{
3019	tavor_wrid_entry_t	*wre;
3020	uint64_t		*wl_wqe;
3021	uint32_t		wqe_index;
3022	uint64_t		wqe_addr;
3023	uint32_t		cqe_wqe_addr;
3024
3025	/* Grab the WQE addr out of the CQE */
3026	cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
3027
3028	/*
3029	 * Use the WQE addr as the lower 32-bit, we add back on the
3030	 * 'wl_srq_desc_off' because we have a zero-based queue.  Then the
3031	 * upper 32-bit of the 'wl_srq_wq_buf' OR'd on gives us the WQE addr in
3032	 * the SRQ Work Queue itself.  We use this address as the index to find
3033	 * out which Work Queue Entry this CQE corresponds with.
3034	 *
3035	 * We also use this address below to add the WQE back on to the free
3036	 * list.
3037	 */
3038	wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
3039	    (cqe_wqe_addr + wl->wl_srq_desc_off);
3040
3041	/*
3042	 * Given the 'wqe_addr' just calculated and the srq buf address, we
3043	 * find the 'wqe_index'.  The 'wre' returned below contains the WRID
3044	 * that we are looking for.  This indexes into the wre_list for this
3045	 * specific WQE.
3046	 */
3047	wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
3048	    wl->wl_srq_log_wqesz);
3049
3050	/* ASSERT on impossible wqe_index values */
3051	ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
3052
3053	/* Get the pointer to this WQE */
3054	wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
3055
3056	/* Put this WQE index back on the free list */
3057	ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
3058	wl->wl_free_list_indx = wqe_index;
3059
3060	/* Using the index, return the Work Request ID Entry (wre) */
3061	wre = &wl->wl_wre[wqe_index];
3062
3063	return (wre);
3064}
3065
3066
3067/*
3068 * tavor_wrid_cq_reap()
3069 *    Context: Can be called from interrupt or base context.
3070 */
3071void
3072tavor_wrid_cq_reap(tavor_cqhdl_t cq)
3073{
3074	tavor_workq_hdr_t	*consume_wqhdr;
3075	tavor_wrid_list_hdr_t	*container, *to_free;
3076
3077	ASSERT(MUTEX_HELD(&cq->cq_lock));
3078
3079	TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3080
3081	/* Lock the list of work queues associated with this CQ */
3082	mutex_enter(&cq->cq_wrid_wqhdr_lock);
3083
3084	/* Walk the "reapable" list and free up containers */
3085	container = cq->cq_wrid_reap_head;
3086	while (container != NULL) {
3087		to_free	  = container;
3088		container = container->wl_reap_next;
3089		/*
3090		 * If reaping the WRID list containers pulls the last
3091		 * container from the given work queue header, then we free
3092		 * the work queue header as well.
3093		 */
3094		consume_wqhdr = tavor_wrid_list_reap(to_free);
3095		if (consume_wqhdr != NULL) {
3096			tavor_cq_wqhdr_remove(cq, consume_wqhdr);
3097		}
3098	}
3099
3100	/* Once finished reaping, we reset the CQ's reap list */
3101	cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
3102
3103	mutex_exit(&cq->cq_wrid_wqhdr_lock);
3104	TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3105}
3106
3107
3108/*
3109 * tavor_wrid_cq_force_reap()
3110 *    Context: Can be called from interrupt or base context.
3111 */
3112void
3113tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
3114{
3115	tavor_workq_hdr_t	*curr;
3116	tavor_wrid_list_hdr_t	*container, *to_free;
3117	avl_tree_t		*treep;
3118	void			*cookie = NULL;
3119
3120	ASSERT(MUTEX_HELD(&cq->cq_lock));
3121
3122	TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3123
3124	/*
3125	 * The first step is to walk the "reapable" list and free up those
3126	 * containers.  This is necessary because the containers on the
3127	 * reapable list are not otherwise connected to the work queue headers
3128	 * anymore.
3129	 */
3130	tavor_wrid_cq_reap(cq);
3131
3132	/* Now lock the list of work queues associated with this CQ */
3133	mutex_enter(&cq->cq_wrid_wqhdr_lock);
3134
3135	/*
3136	 * Walk the list of work queue headers and free up all the WRID list
3137	 * containers chained to it.  Note: We don't need to grab the locks
3138	 * for each of the individual WRID lists here because the only way
3139	 * things can be added or removed from the list at this point would be
3140	 * through post a work request to a QP.  But if we've come this far,
3141	 * then we can be assured that there are no longer any QP associated
3142	 * with the CQ that we are trying to free.
3143	 */
3144#ifdef __lock_lint
3145	tavor_wrid_wqhdr_compare(NULL, NULL);
3146#endif
3147	treep = &cq->cq_wrid_wqhdr_avl_tree;
3148	while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
3149		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
3150		container = curr->wq_wrid_poll;
3151		while (container != NULL) {
3152			to_free	  = container;
3153			container = container->wl_next;
3154			/*
3155			 * If reaping the WRID list containers pulls the last
3156			 * container from the given work queue header, then
3157			 * we free the work queue header as well.  Note: we
3158			 * ignore the return value because we know that the
3159			 * work queue header should always be freed once the
3160			 * list of containers has come to an end.
3161			 */
3162			(void) tavor_wrid_list_reap(to_free);
3163			if (container == NULL) {
3164				tavor_cq_wqhdr_remove(cq, curr);
3165			}
3166		}
3167	}
3168	avl_destroy(treep);
3169
3170	mutex_exit(&cq->cq_wrid_wqhdr_lock);
3171	TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3172}
3173
3174
3175/*
3176 * tavor_wrid_get_list()
3177 *    Context: Can be called from interrupt or base context.
3178 */
3179tavor_wrid_list_hdr_t *
3180tavor_wrid_get_list(uint32_t qsize)
3181{
3182	tavor_wrid_list_hdr_t	*wridlist;
3183	uint32_t		size;
3184
3185	/*
3186	 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
3187	 * which holds the pointers necessary for maintaining the "reapable"
3188	 * list, chaining together multiple "containers" old and new, and
3189	 * tracking the head, tail, size, etc. for each container.
3190	 *
3191	 * The "container" also holds all the tavor_wrid_entry_t's, which is
3192	 * allocated separately, one for each entry on the corresponding work
3193	 * queue.
3194	 */
3195	size = sizeof (tavor_wrid_list_hdr_t);
3196
3197	/*
3198	 * Note that this allocation has to be a NOSLEEP operation here
3199	 * because we are holding the "wqhdr_list_lock" and, therefore,
3200	 * could get raised to the interrupt level.
3201	 */
3202	wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
3203	if (wridlist == NULL) {
3204		return (NULL);
3205	}
3206	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
3207
3208	/* Complete the "container" initialization */
3209	wridlist->wl_size = qsize;
3210	wridlist->wl_full = 0;
3211	wridlist->wl_head = 0;
3212	wridlist->wl_tail = 0;
3213	wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3214	    sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3215	if (wridlist->wl_wre == NULL) {
3216		kmem_free(wridlist, size);
3217		return (NULL);
3218	}
3219	wridlist->wl_wre_old_tail  = NULL;
3220	wridlist->wl_reap_next = NULL;
3221	wridlist->wl_next  = NULL;
3222	wridlist->wl_prev  = NULL;
3223	wridlist->wl_srq_en = 0;
3224
3225	return (wridlist);
3226}
3227
3228/*
3229 * tavor_wrid_list_srq_init()
3230 * Context: Can be called from interrupt or base context
3231 */
3232void
3233tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3234    uint_t wq_start)
3235{
3236	uint64_t *wl_wqe;
3237	int wqe_index;
3238
3239	ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3240
3241	/* Setup pointers for use later when we are polling the CQ */
3242	wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3243	wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3244	wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3245	wridlist->wl_srq_desc_off = srq->srq_desc_off;
3246	wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3247
3248	/* Given wq_start to start initializing buf at, verify sanity */
3249	ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz);
3250
3251	/*
3252	 * Initialize wridlist free list
3253	 *
3254	 * For each WQ up to the size of our queue, we store an index in the WQ
3255	 * memory itself, representing the next available free entry.  The
3256	 * 'wl_free_list_indx' always holds the index of the next available
3257	 * free entry in the WQ.  If 'wl_free_list_indx' is -1, then we are
3258	 * completely full.  This gives us the advantage of being able to have
3259	 * entries complete or be polled off the WQ out-of-order.
3260	 *
3261	 * For now, we write the free_list entries inside the WQ itself.  It
3262	 * may be useful in the future to store this information in a separate
3263	 * structure for debugging purposes.
3264	 */
3265	for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3266		wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3267		ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3268		    wridlist->wl_free_list_indx);
3269		wridlist->wl_free_list_indx = wqe_index;
3270	}
3271}
3272
3273
3274/*
3275 * tavor_wrid_reaplist_add()
3276 *    Context: Can be called from interrupt or base context.
3277 */
3278static void
3279tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3280{
3281	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3282
3283	TAVOR_TNF_ENTER(tavor_wrid_reaplist_add);
3284
3285	mutex_enter(&wq->wq_wrid_wql->wql_lock);
3286
3287	/*
3288	 * Add the "post" container (the last one on the current chain) to
3289	 * the CQ's "reapable" list
3290	 */
3291	if ((cq->cq_wrid_reap_head == NULL) &&
3292	    (cq->cq_wrid_reap_tail == NULL)) {
3293		cq->cq_wrid_reap_head = wq->wq_wrid_post;
3294		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3295	} else {
3296		cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3297		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3298	}
3299
3300	mutex_exit(&wq->wq_wrid_wql->wql_lock);
3301}
3302
3303
3304int
3305tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3306{
3307	tavor_workq_compare_t	*cmpp;
3308	tavor_workq_hdr_t	*curr;
3309
3310	cmpp = (tavor_workq_compare_t *)p1;
3311	curr = (tavor_workq_hdr_t *)p2;
3312
3313	if (cmpp->cmp_qpn < curr->wq_qpn)
3314		return (-1);
3315	else if (cmpp->cmp_qpn > curr->wq_qpn)
3316		return (+1);
3317	else if (cmpp->cmp_type < curr->wq_type)
3318		return (-1);
3319	else if (cmpp->cmp_type > curr->wq_type)
3320		return (+1);
3321	else
3322		return (0);
3323}
3324
3325
3326/*
3327 * tavor_wrid_wqhdr_find()
3328 *    Context: Can be called from interrupt or base context.
3329 */
3330static tavor_workq_hdr_t *
3331tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3332{
3333	tavor_workq_hdr_t	*curr;
3334	tavor_workq_compare_t	cmp;
3335
3336	TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find);
3337
3338	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3339
3340	/*
3341	 * Walk the CQ's work queue list, trying to find a send or recv queue
3342	 * with the same QP number.  We do this even if we are going to later
3343	 * create a new entry because it helps us easily find the end of the
3344	 * list.
3345	 */
3346	cmp.cmp_qpn = qpn;
3347	cmp.cmp_type = wq_type;
3348#ifdef __lock_lint
3349	tavor_wrid_wqhdr_compare(NULL, NULL);
3350#endif
3351	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3352
3353	TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find);
3354	return (curr);
3355}
3356
3357
3358/*
3359 * tavor_wrid_wqhdr_create()
3360 *    Context: Can be called from interrupt or base context.
3361 */
3362static tavor_workq_hdr_t *
3363tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3364    uint_t wq_type, uint_t create_wql)
3365{
3366	tavor_workq_hdr_t	*wqhdr_tmp;
3367
3368	TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create);
3369
3370	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3371
3372	/*
3373	 * Allocate space a work queue header structure and initialize it.
3374	 * Each work queue header structure includes a "wq_wrid_wql"
3375	 * which needs to be initialized.  Note that this allocation has to be
3376	 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3377	 * and, therefore, could get raised to the interrupt level.
3378	 */
3379	wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3380	    sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3381	if (wqhdr_tmp == NULL) {
3382		TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3383		return (NULL);
3384	}
3385	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3386	wqhdr_tmp->wq_qpn	= qpn;
3387	wqhdr_tmp->wq_type	= wq_type;
3388
3389	if (create_wql) {
3390		wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3391		if (wqhdr_tmp->wq_wrid_wql == NULL) {
3392			kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3393			TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3394			return (NULL);
3395		}
3396	}
3397
3398	wqhdr_tmp->wq_wrid_poll = NULL;
3399	wqhdr_tmp->wq_wrid_post = NULL;
3400
3401	/* Chain the newly allocated work queue header to the CQ's list */
3402	tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3403
3404	TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3405	return (wqhdr_tmp);
3406}
3407
3408
3409/*
3410 * tavor_wrid_wql_create()
3411 *    Context: Can be called from interrupt or base context.
3412 */
3413tavor_wq_lock_t *
3414tavor_wrid_wql_create(tavor_state_t *state)
3415{
3416	tavor_wq_lock_t *wql;
3417
3418	TAVOR_TNF_ENTER(tavor_wrid_wql_create);
3419
3420	/*
3421	 * Allocate the WQL and initialize it.
3422	 */
3423	wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3424	if (wql == NULL) {
3425		TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3426		return (NULL);
3427	}
3428
3429	mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3430	    DDI_INTR_PRI(state->ts_intrmsi_pri));
3431
3432	/* Add refcount to WQL */
3433	tavor_wql_refcnt_inc(wql);
3434
3435	TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3436	return (wql);
3437}
3438
3439
3440/*
3441 * tavor_wrid_get_wqeaddrsz()
3442 *    Context: Can be called from interrupt or base context.
3443 */
3444static uint32_t
3445tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3446{
3447	tavor_wrid_entry_t	*wre;
3448	uint32_t		wqeaddrsz;
3449	uint32_t		head;
3450
3451	/*
3452	 * If the container is empty, then there is no next entry. So just
3453	 * return zero.  Note: the "head == tail" condition here can only
3454	 * mean that the container is empty because we have previously pulled
3455	 * something from the container.
3456	 *
3457	 * If the container is not empty, then find the next entry and return
3458	 * the contents of its "wqeaddrsz" field.
3459	 */
3460	if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3461		wqeaddrsz = 0;
3462	} else {
3463		/*
3464		 * We don't need to calculate the "next" head pointer here
3465		 * because "head" should already point to the next entry on
3466		 * the list (since we just pulled something off - in
3467		 * tavor_wrid_find_match() - and moved the head index forward.)
3468		 */
3469		head = wq->wq_wrid_poll->wl_head;
3470		wre = &wq->wq_wrid_poll->wl_wre[head];
3471		wqeaddrsz = wre->wr_wqeaddrsz;
3472	}
3473	return (wqeaddrsz);
3474}
3475
3476
3477/*
3478 * tavor_wrid_wqhdr_add()
3479 *    Context: Can be called from interrupt or base context.
3480 */
3481static void
3482tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3483    tavor_wrid_list_hdr_t *wridlist)
3484{
3485	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3486
3487	/* Chain the new WRID list "container" to the work queue list */
3488	if ((wqhdr->wq_wrid_post == NULL) &&
3489	    (wqhdr->wq_wrid_poll == NULL)) {
3490		wqhdr->wq_wrid_poll = wridlist;
3491		wqhdr->wq_wrid_post = wridlist;
3492	} else {
3493		wqhdr->wq_wrid_post->wl_next = wridlist;
3494		wridlist->wl_prev = wqhdr->wq_wrid_post;
3495		wqhdr->wq_wrid_post = wridlist;
3496	}
3497}
3498
3499
3500/*
3501 * tavor_wrid_wqhdr_remove()
3502 *    Context: Can be called from interrupt or base context.
3503 *
3504 *    Note: this is only called to remove the most recently added WRID list
3505 *    container (i.e. in tavor_from_reset() above)
3506 */
3507static void
3508tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3509    tavor_wrid_list_hdr_t *wridlist)
3510{
3511	tavor_wrid_list_hdr_t	*prev, *next;
3512
3513	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3514
3515	/* Unlink the WRID list "container" from the work queue list */
3516	prev = wridlist->wl_prev;
3517	next = wridlist->wl_next;
3518	if (prev != NULL) {
3519		prev->wl_next = next;
3520	}
3521	if (next != NULL) {
3522		next->wl_prev = prev;
3523	}
3524
3525	/*
3526	 * Update any pointers in the work queue hdr that may point to this
3527	 * WRID list container
3528	 */
3529	if (wqhdr->wq_wrid_post == wridlist) {
3530		wqhdr->wq_wrid_post = prev;
3531	}
3532	if (wqhdr->wq_wrid_poll == wridlist) {
3533		wqhdr->wq_wrid_poll = NULL;
3534	}
3535}
3536
3537
3538/*
3539 * tavor_wrid_list_reap()
3540 *    Context: Can be called from interrupt or base context.
3541 *    Note: The "wqhdr_list_lock" must be held.
3542 */
3543static tavor_workq_hdr_t *
3544tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3545{
3546	tavor_workq_hdr_t	*wqhdr, *consume_wqhdr = NULL;
3547	tavor_wrid_list_hdr_t	*prev, *next;
3548	uint32_t		size;
3549
3550	TAVOR_TNF_ENTER(tavor_wrid_list_reap);
3551
3552	/* Get the back pointer to the work queue header (see below) */
3553	wqhdr = wridlist->wl_wqhdr;
3554	mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3555
3556	/* Unlink the WRID list "container" from the work queue list */
3557	prev = wridlist->wl_prev;
3558	next = wridlist->wl_next;
3559	if (prev != NULL) {
3560		prev->wl_next = next;
3561	}
3562	if (next != NULL) {
3563		next->wl_prev = prev;
3564	}
3565
3566	/*
3567	 * If the back pointer to the work queue header shows that it
3568	 * was pointing to the entry we are about to remove, then the work
3569	 * queue header is reapable as well.
3570	 */
3571	if ((wqhdr->wq_wrid_poll == wridlist) &&
3572	    (wqhdr->wq_wrid_post == wridlist)) {
3573		consume_wqhdr = wqhdr;
3574	}
3575
3576	/* Be sure to update the "poll" and "post" container pointers */
3577	if (wqhdr->wq_wrid_poll == wridlist) {
3578		wqhdr->wq_wrid_poll = next;
3579	}
3580	if (wqhdr->wq_wrid_post == wridlist) {
3581		wqhdr->wq_wrid_post = NULL;
3582	}
3583
3584	/* Calculate the size and free the container */
3585	size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3586	kmem_free(wridlist->wl_wre, size);
3587	kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3588
3589	mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3590
3591	TAVOR_TNF_EXIT(tavor_wrid_list_reap);
3592	return (consume_wqhdr);
3593}
3594
3595
3596/*
3597 * tavor_wrid_wqhdr_lock_both()
3598 *    Context: Can be called from interrupt or base context.
3599 */
3600static void
3601tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3602{
3603	tavor_cqhdl_t	sq_cq, rq_cq;
3604
3605	sq_cq = qp->qp_sq_cqhdl;
3606	rq_cq = qp->qp_rq_cqhdl;
3607
3608_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3609_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3610
3611	/*
3612	 * If both work queues (send and recv) share a completion queue, then
3613	 * grab the common lock.  If they use different CQs (hence different
3614	 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
3615	 * receive.  We do this consistently and correctly in
3616	 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind
3617	 * of dead lock condition.  Note:  We add the "__lock_lint" code here
3618	 * to fake out warlock into thinking we've grabbed both locks (when,
3619	 * in fact, we only needed the one).
3620	 */
3621	if (sq_cq == rq_cq) {
3622		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3623#ifdef	__lock_lint
3624		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3625#endif
3626	} else {
3627		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3628		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3629	}
3630}
3631
3632/*
3633 * tavor_wrid_wqhdr_unlock_both()
3634 *    Context: Can be called from interrupt or base context.
3635 */
3636static void
3637tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3638{
3639	tavor_cqhdl_t	sq_cq, rq_cq;
3640
3641	sq_cq = qp->qp_sq_cqhdl;
3642	rq_cq = qp->qp_rq_cqhdl;
3643
3644_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3645_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3646
3647	/*
3648	 * See tavor_wrid_wqhdr_lock_both() above for more detail
3649	 */
3650	if (sq_cq == rq_cq) {
3651#ifdef	__lock_lint
3652		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3653#endif
3654		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3655	} else {
3656		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3657		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3658	}
3659}
3660
3661
3662/*
3663 * tavor_cq_wqhdr_add()
3664 *    Context: Can be called from interrupt or base context.
3665 */
3666static void
3667tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3668{
3669	tavor_workq_compare_t	cmp;
3670	avl_index_t		where;
3671
3672	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3673
3674	cmp.cmp_qpn = wqhdr->wq_qpn;
3675	cmp.cmp_type = wqhdr->wq_type;
3676#ifdef __lock_lint
3677	tavor_wrid_wqhdr_compare(NULL, NULL);
3678#endif
3679	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3680	/*
3681	 * If the CQ's work queue list is empty, then just add it.
3682	 * Otherwise, chain it to the beginning of the list.
3683	 */
3684	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3685}
3686
3687
3688/*
3689 * tavor_cq_wqhdr_remove()
3690 *    Context: Can be called from interrupt or base context.
3691 */
3692static void
3693tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3694{
3695	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3696
3697#ifdef __lock_lint
3698	tavor_wrid_wqhdr_compare(NULL, NULL);
3699#endif
3700	/* Remove "wqhdr" from the work queue header list on "cq" */
3701	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3702
3703	/*
3704	 * Release reference to WQL; If this is the last reference, this call
3705	 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3706	 */
3707	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3708
3709	/* Free the memory associated with "wqhdr" */
3710	kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3711}
3712
3713
3714/*
3715 * tavor_wql_refcnt_inc()
3716 * Context: Can be called from interrupt or base context
3717 */
3718void
3719tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3720{
3721	ASSERT(wql != NULL);
3722
3723	mutex_enter(&wql->wql_lock);
3724	wql->wql_refcnt++;
3725	mutex_exit(&wql->wql_lock);
3726}
3727
3728/*
3729 * tavor_wql_refcnt_dec()
3730 * Context: Can be called from interrupt or base context
3731 */
3732void
3733tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3734{
3735	int	refcnt;
3736
3737	ASSERT(wql != NULL);
3738
3739	mutex_enter(&wql->wql_lock);
3740	wql->wql_refcnt--;
3741	refcnt = wql->wql_refcnt;
3742	mutex_exit(&wql->wql_lock);
3743
3744	/*
3745	 *
3746	 * Free up WQL memory if we're the last one associated with this
3747	 * structure.
3748	 */
3749	if (refcnt == 0) {
3750		mutex_destroy(&wql->wql_lock);
3751		kmem_free(wql, sizeof (tavor_wq_lock_t));
3752	}
3753}
3754