tavor_cq.c revision 9517:b4839b0aa7a4
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

/*
 * Used by tavor_cq_numcalc() below to fill in the "unconstrained" portion
 * of the Tavor completion queue number
 */
static uint_t tavor_debug_cqnum_cnt = 0x00000000;
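
/*
 * Illustrative sketch: assuming the layout implied by
 * tavor_cqhdl_from_cqnum() below, a CQ number can be thought of as being
 * composed from a "constrained" part (the low cp_log_num_cq bits, which
 * must equal the CQC table index) and an "unconstrained" part (the bits
 * above it, taken from the counter above), roughly:
 *
 *     cqnum = (tavor_debug_cqnum_cnt++ << log_num_cq) |
 *         (cqc_indx & ((1 << log_num_cq) - 1));
 *
 * Varying the unconstrained bits on every allocation is what allows a
 * stale event for a freed-and-reallocated CQ to be detected.
 */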

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);
static void tavor_cq_numcalc(tavor_state_t *state, uint32_t indx,
    uint32_t *key);

/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_cqhdl_t		cq;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_hw_cqe_t		*buf;
	uint64_t		addr, value;
	uint32_t		log_cq_size, lkey, uarpg;
	uint_t			dma_xfer_mode, cq_sync, cq_is_umap;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_alloc);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

	/*
	 * Determine whether CQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the CQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
	if (cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto cqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Use the internal protection domain (PD) for setting up CQs */
	pd = state->ts_pdhdl_internal;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate a CQ context entry.  This will be filled in with all
	 * the necessary parameters to define the Completion Queue.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
		goto cqalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the completion queue
	 * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
	 * undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
		goto cqalloc_fail2;
	}
	cq = (tavor_cqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
	cq->cq_is_umap = cq_is_umap;

	/*
	 * Calculate the CQ number from CQC index.  In much the same way
	 * as we create keys for memory regions (see tavor_mr.c), this CQ
	 * number is constructed from a "constrained" portion (which depends
	 * on the CQC index) and an "unconstrained" portion (which is
	 * arbitrarily chosen).
	 */
	tavor_cq_numcalc(state, cqc->tr_indx, &cq->cq_cqnum);

	/*
	 * If this will be a user-mappable CQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further CQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (cq->cq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
		    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto cqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(cq_attr->cq_size);
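
	/*
	 * Worked example (illustration only): a request for 100 CQEs gives
	 * log_cq_size = highbit(100) = 7, so 128 entries are allocated and
	 * the usable size reported back through "actual_size" below is
	 * (1 << 7) - 1 = 127.
	 */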

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqalloc_fail4;
	}

	/*
	 * Allocate the memory for Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqalloc_fail4;
	}
	buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final transfer of ownership (below) of the
	 * CQ context itself.
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.  Note: If this is a user-
	 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = cq->cq_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

	/* Sync entire CQ for use by the hardware (if necessary). */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Fill in the CQC entry.  This is the final step before passing
	 * ownership of the CQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the CQC.  Note: If this CQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cq->cq_eqnum		= TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
	cq->cq_erreqnum		= TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
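
	/*
	 * Note that the completion EQ and error EQ numbers are derived from
	 * the CQ number itself (via the TAVOR_CQ_EQNUM_GET() and
	 * TAVOR_CQ_ERREQNUM_GET() macros); this presumably serves to spread
	 * completion events across the available event queues.
	 */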
	cqc_entry.xlat		= TAVOR_VA2PA_XLAT_ENABLED;
	cqc_entry.state		= TAVOR_CQ_DISARMED;
	cqc_entry.start_addr_h	= (addr >> 32);
	cqc_entry.start_addr_l	= (addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	if (cq->cq_is_umap) {
		cqc_entry.usr_page = uarpg;
	} else {
		cqc_entry.usr_page = 0;
	}
	cqc_entry.pd		= pd->pd_pdnum;
	cqc_entry.lkey		= lkey;
	cqc_entry.e_eqn		= cq->cq_erreqnum;
	cqc_entry.c_eqn		= cq->cq_eqnum;
	cqc_entry.cqn		= cq->cq_cqnum;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
		goto cqalloc_fail6;
	}

	/*
	 * Fill in the rest of the Tavor Completion Queue handle.  Having
	 * successfully transferred ownership of the CQC, we can update the
	 * following fields for use in further operations on the CQ.
	 */
	cq->cq_cqcrsrcp	  = cqc;
	cq->cq_rsrcp	  = rsrc;
	cq->cq_consindx	  = 0;
	cq->cq_buf	  = buf;
	cq->cq_bufsz	  = (1 << log_cq_size);
	cq->cq_mrhdl	  = mr;
	cq->cq_sync	  = cq_sync;
	cq->cq_refcnt	  = 0;
	cq->cq_is_special = 0;
	cq->cq_uarpg	  = uarpg;
	cq->cq_umap_dhp	  = (devmap_cookie_t)NULL;
	avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
	    sizeof (struct tavor_workq_hdr_s),
	    offsetof(struct tavor_workq_hdr_s, wq_avl_link));

	cq->cq_wrid_reap_head  = NULL;
	cq->cq_wrid_reap_tail  = NULL;
	cq->cq_hdlrarg	  = (void *)ibt_cqhdl;

	/*
	 * Put CQ handle in Tavor CQNum-to-CQHdl list.  Then fill in the
	 * "actual_size" and "cqhdl" and return success
	 */
	ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
	state->ts_cqhdl[cqc->tr_indx] = cq;

	/*
	 * If this is a user-mappable CQ, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (cq->cq_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}
	*cqhdl = cq;

	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
	}
cqalloc_fail5:
	tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
	if (cq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
cqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
	tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
cqalloc_fail:
	TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (status);
}


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		cq;
	uint32_t		cqnum;
	uint64_t		value;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_cq_free);

	/*
	 * Pull all the necessary information from the Tavor Completion Queue
	 * handle.  This is necessary here because the resource for the
	 * CQ handle is going to be freed up as part of this operation.
	 */
	cq	= *cqhdl;
	mutex_enter(&cq->cq_lock);
	cqc	= cq->cq_cqcrsrcp;
	rsrc	= cq->cq_rsrcp;
	pd	= state->ts_pdhdl_internal;
	mr	= cq->cq_mrhdl;
	cqnum	= cq->cq_cqnum;

	/*
	 * If there are work queues still associated with the CQ, then return
	 * an error.  Otherwise, we will be holding the CQ lock.
	 */
	if (cq->cq_refcnt != 0) {
		mutex_exit(&cq->cq_lock);
		TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, cq->cq_refcnt);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (IBT_CQ_BUSY);
	}

	/*
	 * If this was a user-mappable CQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
	 * We also need to invalidate the CQ tracking information for the
	 * user mapping.
	 */
	if (cq->cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, cqnum,
		    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_cq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (cq->cq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(cq->cq_umap_dhp,
			    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&cq->cq_lock);
				TAVOR_WARNING(state, "failed in CQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_cq_free);
				return (ibc_get_ci_failure(0));
			}
			cq->cq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
	 * in-progress events to detect that the CQ corresponding to this
	 * number has been freed.
	 */
	state->ts_cqhdl[cqc->tr_indx] = NULL;

	/*
	 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
	 * list.  This cleans up all the structures associated with the WRID
	 * processing for this CQ.  Once we complete, drop the lock and finish
	 * the deallocation of the CQ.
	 */
	tavor_wrid_cq_force_reap(cq);

	mutex_exit(&cq->cq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

	/*
	 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim CQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Deregister the memory for the Completion Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
		TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/* Free the memory for the CQ */
	tavor_queue_free(state, &cq->cq_cqinfo);

	/* Free the Tavor Completion Queue handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the CQC entry resource */
	tavor_rsrc_free(state, &cqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the cqhdl pointer to NULL and return success */
	*cqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_cq_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
	tavor_hw_cqc_t		cqc_entry;
	tavor_qalloc_info_t	new_cqinfo, old_cqinfo;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr, mr_old;
	tavor_hw_cqe_t		*buf;
	uint32_t		new_prod_indx, old_cons_indx;
	uint_t			dma_xfer_mode, cq_sync, log_cq_size, maxprot;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_resize);

	/* Use the internal protection domain (PD) for CQs */
	pd = state->ts_pdhdl_internal;

	/*
	 * Calculate the appropriate size for the new resized completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(req_size);

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqresize_fail;
	}

	/*
	 * Allocate the memory for newly resized Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is the same
	 * as we do when we first allocate completion queues primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqresize_fail;
	}
	buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final resize operation (below).
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = new_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		tavor_queue_free(state, &new_cqinfo);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqresize_fail;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Now we grab the CQ lock.  Since we will be updating the actual
	 * CQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "cq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&cq->cq_lock);

	/*
	 * Determine the current CQ "consumer index".
	 *
	 * Note:  This will depend on whether the CQ had previously been
	 * mapped for user access or whether it is a kernel CQ.  If this
	 * is a kernel CQ, then all PollCQ() operations have come through
	 * the IBTF and, hence, the driver's CQ state structure will
	 * contain the current consumer index.  If, however, the user has
	 * accessed this CQ by bypassing the driver (OS-bypass), then we
	 * need to query the firmware to determine the current CQ consumer
	 * index.  This also assumes that the user process will not continue
	 * to consume entries while at the same time doing the ResizeCQ()
	 * operation.  If the user process does not guarantee this, then it
	 * may see duplicate or missed completions.  But under no
	 * circumstances should this panic the system.
	 */
	if (cq->cq_is_umap) {
		status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
		    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
		    TAVOR_NOSLEEP);
		if (status != TAVOR_CMD_SUCCESS) {
			/* Query CQ has failed, drop CQ lock and cleanup */
			mutex_exit(&cq->cq_lock);
			if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
			    sleepflag) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "CQ memory");
			}
			tavor_queue_free(state, &new_cqinfo);
			TAVOR_WARNING(state, "failed to query CQ context");

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "failed CQ query");
			goto cqresize_fail;
		}
		old_cons_indx = cqc_entry.cons_indx;
	} else {
		old_cons_indx = cq->cq_consindx;
	}

	/*
	 * Fill in the CQC entry.  For the resize operation this is the
	 * final step before attempting the resize operation on the CQC entry.
	 * We use all of the information collected/calculated above to fill
	 * in the requisite portions of the CQC.
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cqc_entry.start_addr_h	= (mr->mr_bindinfo.bi_addr >> 32);
	cqc_entry.start_addr_l	= (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	cqc_entry.lkey		= mr->mr_lkey;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.  Also note that the status returned may indicate
	 * the code to return to the IBTF.
	 */
	status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
	    &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		/* Resize attempt has failed, drop CQ lock and cleanup */
		mutex_exit(&cq->cq_lock);
		if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
		    sleepflag) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister CQ memory");
		}
		tavor_queue_free(state, &new_cqinfo);
		if (status == TAVOR_CMD_BAD_SIZE) {
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (IBT_CQ_SZ_INSUFFICIENT);
		} else {
			cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
			    "%08x\n", status);
			TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * The CQ resize attempt was successful.  Before dropping the CQ lock,
	 * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
	 * the Tavor firmware guarantees us that sufficient space is set aside
	 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
	 * The two parameters to this helper function ("old_cons_indx" and
	 * "new_prod_indx") essentially indicate the starting index and number
	 * of any CQEs that might remain in the "old" CQ memory.
	 */
	tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Update the Tavor Completion Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	mr_old		 = cq->cq_mrhdl;
	old_cqinfo	 = cq->cq_cqinfo;
	cq->cq_cqinfo	 = new_cqinfo;
	cq->cq_consindx	 = 0;
	cq->cq_buf	 = buf;
	cq->cq_bufsz	 = (1 << log_cq_size);
	cq->cq_mrhdl	 = mr;
	cq->cq_sync	 = cq_sync;

	/*
	 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the CQ memory.  We also need to
	 * invalidate the CQ tracking information for the user mapping.
	 */
	if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(cq->cq_umap_dhp,
		    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed in CQ memory "
			    "devmap_devmem_remap()");
			TAVOR_TNF_EXIT(tavor_cq_free);
			return (ibc_get_ci_failure(0));
		}
		cq->cq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the CQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&cq->cq_lock);

	/*
	 * Deregister the memory for the old Completion Queue.  Note: We
	 * really can't return error here because we have no good way to
	 * cleanup.  Plus, the deregistration really shouldn't ever fail.
	 * So, if it does, it is an indication that something has gone
	 * seriously wrong.  So we print a warning message and return error
	 * (knowing, of course, that the "old" CQ memory will be leaked)
	 */
	status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister old CQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed deregister mr (old)");
		goto cqresize_fail;
	}

	/* Free the memory for the old CQ */
	tavor_queue_free(state, &old_cqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}

	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (DDI_SUCCESS);

cqresize_fail:
	TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
	uint_t		cqnum;

	TAVOR_TNF_ENTER(tavor_cq_notify);

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 *
	 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
	 * regarding why we do not have to do an extra PIO read here, and we
	 * will not lose an event after writing this doorbell.
	 */
	cqnum = cq->cq_cqnum;
	if (flags == IBT_NEXT_COMPLETION) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IBT_NEXT_SOLICITED) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else {
		TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, flags, flags);
		TAVOR_TNF_EXIT(tavor_cq_notify);
		return (IBT_CQ_NOTIFY_TYPE_INVALID);
	}

	TAVOR_TNF_EXIT(tavor_cq_notify);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
	tavor_hw_cqe_t	*cqe;
	uint32_t	cons_indx, wrap_around_mask;
	uint32_t	polled_cnt, num_to_increment;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_poll);

	/*
	 * Check for user-mappable CQ memory.  Note:  We do not allow kernel
	 * clients to poll CQ memory that is accessible directly by the user.
	 * If the CQ memory is user accessible, then return an error.
	 */
	if (cq->cq_is_umap) {
		TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_poll);
		return (IBT_CQ_HDL_INVALID);
	}

	mutex_enter(&cq->cq_lock);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);
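
	/*
	 * For example: with cq_bufsz = 8 the mask is 0x7, so incrementing
	 * the consumer index as (cons_indx + 1) & 0x7 wraps index 7 back
	 * to 0 without any conditional test.
	 */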

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is an error CQE and corresponds to more than one
	 * completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
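
		/*
		 * Worked example (illustration only): with cq_bufsz = 8, an
		 * old consumer index of 6 and a new one of 2, we wrapped and
		 * consumed ((2 + 8) - 6) = 4 entries, so the doorbell
		 * parameter is 4 - 1 = 3.
		 */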
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	TAVOR_TNF_EXIT(tavor_cq_poll);
	return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_handler);
		return (DDI_FAILURE);
	}


	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is, then it
	 * means that we have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		TAVOR_DO_IBTF_CQ_CALLB(state, cq);
	} else {
		TNF_PROBE_2(tavor_cq_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	ibc_async_event_t	event;
	ibt_async_code_t	type;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_err_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_err_handler);
		return (DDI_FAILURE);
	}

	/* cmn_err(CE_CONT, "CQ Error handler\n"); */

	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * And then we check if "ts_ibtfpriv" is NULL.  If it is, then it
	 * means that we have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
		type		= IBT_ERROR_CQ;

		TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
	} else {
		TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_err_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
	/*
	 * Increment the completion queue's reference count.  Note: In order
	 * to ensure compliance with IBA C11-15, we must ensure that a given
	 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
	 * This is accomplished here by keeping track of how the referenced
	 * CQ is being used.
	 */
	mutex_enter(&cq->cq_lock);
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	if (cq->cq_refcnt == 0) {
		cq->cq_is_special = is_special;
	} else {
		if (cq->cq_is_special != is_special) {
			mutex_exit(&cq->cq_lock);
			return (DDI_FAILURE);
		}
	}
	cq->cq_refcnt++;
	mutex_exit(&cq->cq_lock);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
	/* Decrement the completion queue's reference count */
	mutex_enter(&cq->cq_lock);
	cq->cq_refcnt--;
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t	doorbell = 0;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
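
	/*
	 * The 64-bit doorbell is a simple bitwise composition: the command
	 * in the uppermost field, the CQ number below it, and the parameter
	 * (e.g. the "entries consumed minus 1" count from tavor_cq_poll(),
	 * or TAVOR_CQDB_DEFAULT_PARAM from tavor_cq_notify()) in the low
	 * bits.  The exact field widths are given by the
	 * TAVOR_CQDB_*_SHIFT constants.
	 */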

	TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
	    tnf_ulong, doorbell, doorbell);

	/* Write the doorbell to UAR */
	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
	    doorbell);
}


/*
 * tavor_cqhdl_from_cqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the CQ number is critical to the detection of a
 *    potential race condition in the CQ handler code (i.e. the case
 *    where a CQ is freed and alloc'd again before an event for the
 *    "old" CQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
	uint_t	cqindx, cqmask;

	/* Calculate the CQ table index from the cqnum */
	cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
	cqindx = cqnum & cqmask;
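
	/*
	 * For example: with cp_log_num_cq = 16 the mask is 0xFFFF, so CQ
	 * numbers 0x00042 and 0x30042 both map to table slot 0x0042.  The
	 * caller is expected to compare the handle's cq_cqnum against the
	 * event's CQ number to weed out stale events.
	 */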
	return (state->ts_cqhdl[cqindx]);
}


/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint_t		flags, type, opcode, qpnum, qp1_indx;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_cqe_consume);

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
	 * whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
		TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

		/* Send CQE */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			break;

		case TAVOR_CQE_SND_ATOMIC_CS:
			type = IBT_WRC_CSWAP;
			break;

		case TAVOR_CQE_SND_ATOMIC_FA:
			type = IBT_WRC_FADD;
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			TAVOR_WARNING(state, "unknown send CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/* Receive CQE */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV_IMM2:
			/*
			 * Note:  According to the Tavor PRM, all QP1 recv
			 * completions look like the result of a Send with
			 * Immediate.  They are not, however (MADs are Send
			 * Only), so we need to check the QP number and set
			 * the flag only if it is non-QP1.
			 */
			qpnum	 = TAVOR_CQE_QPNUM_GET(cq, cqe);
			qp1_indx = state->ts_spec_qp1->tr_indx;
			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
				flags |= IBT_WC_IMMED_DATA_PRESENT;
			}
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			break;

		case TAVOR_CQE_RCV_RDMAWR_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RDMAWR_IMM2:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			type = IBT_WRC_RECV_RDMAWI;
			break;

		default:
			TAVOR_WARNING(state, "unknown recv CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;

	/*
	 * Check for GRH, update the flags, then fill in "wc_flags" field
	 * in the work completion
	 */
	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
		flags |= IBT_WC_GRH_PRESENT;
	}
	wc->wc_flags = flags;

	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	/*
	 * Parse the remaining contents of the CQE into the work completion.
	 * This means filling in SL, QP number, SLID, immediate data, etc.
	 * Note:  Not all of these fields are valid in a given completion.
	 * Many of them depend on the actual type of completion.  So we fill
	 * in all of the fields and leave it up to the IBTF and consumer to
	 * sort out which are valid based on their context.
	 */
	wc->wc_sl	  = TAVOR_CQE_SL_GET(cq, cqe);
	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	wc->wc_qpn	  = TAVOR_CQE_DQPN_GET(cq, cqe);
	wc->wc_res_hash	  = 0;
	wc->wc_slid	  = TAVOR_CQE_DLID_GET(cq, cqe);
	wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
	wc->wc_pkey_ix	  = (wc->wc_immed_data >> 16);
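
	/*
	 * Note that a single 32-bit CQE field (fetched above via
	 * TAVOR_CQE_IMM_ETH_PKEY_CRED_GET()) is overloaded: it supplies the
	 * immediate data, the ethertype (low 16 bits), and the P_Key index
	 * (high 16 bits).  Which interpretation is meaningful depends on
	 * the completion type, and sorting that out is left to the consumer.
	 */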

	/*
	 * Depending on whether the completion was a receive or a send
	 * completion, fill in "bytes transferred" as appropriate.  Also,
	 * if necessary, fill in the "path bits" field.
	 */
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
	}

	TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * tavor_cq_errcqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint64_t		next_wqeaddr;
	uint32_t		imm_eth_pkey_cred;
	uint_t			nextwqesize, dbd;
	uint_t			doorbell_cnt, status;
	tavor_wrid_entry_t	wre;

	TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		TAVOR_WARNING(state, "unknown error CQE status");
		status = IBT_WC_LOCAL_QP_OP_ERR;
		TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		break;
	}
	wc->wc_status = status;

	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling"
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get doorbell info, WQE address, size for the next WQE
	 * from the "wre" (which was filled in above in the call to the
	 * tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
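
	/*
	 * Roughly speaking, a single error CQE can stand in for a whole
	 * chain of flushed work requests: on each pass through here the
	 * count is decremented (when the corresponding WQE was doorbelled)
	 * and the CQE is recycled, until the count reaches zero (or there
	 * is no next WQE) and the CQE is finally consumed for real.
	 */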

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 *    Return status to indicate that doorbell and sync may be
		 *    necessary.
		 */
		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
		return (TAVOR_CQ_SYNC_AND_DB);

1619	} else {
1620		/*
1621		 * Recycle the CQE for use in the next PollCQ() call
1622		 *    Decrement the doorbell count, modify the error status,
1623		 *    and update the WQE address and size (to point to the
1624		 *    next WQE on the chain.  Put these update entries back
1625		 *    into the CQE.
1626		 *    Despite the fact that we have updated the CQE, it is not
1627		 *    necessary for us to attempt to sync this entry just yet
1628		 *    as we have not changed the "hardware's view" of the
1629		 *    entry (i.e. we have not modified the "owner" bit - which
1630		 *    is all that the Tavor hardware really cares about.
1631		 */
1632		doorbell_cnt = doorbell_cnt - dbd;
1633		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1634		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1635		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1636		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1637		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1638
1639		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1640		return (TAVOR_CQ_RECYCLE_ENTRY);
1641	}
1642}
1643
1644
/*
 * tavor_cqe_sync()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
{
	ddi_dma_handle_t	dmahdl;
	off_t			offset;
	int			status;

	TAVOR_TNF_ENTER(tavor_cqe_sync);

	/* Determine if CQ needs to be synced or not */
	if (cq->cq_sync == 0) {
		TAVOR_TNF_EXIT(tavor_cqe_sync);
		return;
	}

	/* Get the DMA handle from CQ context */
	dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;

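	/*
	 * The CQ buffer is virtually contiguous, so a CQE's byte offset
	 * within the mapped region is simply its pointer difference from
	 * the start of the buffer (equivalently, the CQE index multiplied
	 * by sizeof (tavor_hw_cqe_t)).
	 */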
	/* Calculate offset of next CQE */
	offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cqe_sync);
		return;
	}

	TAVOR_TNF_EXIT(tavor_cqe_sync);
}


/*
 * tavor_cq_resize_helper()
 *    Context: Can be called only from user or kernel context.
 */
static void
tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe)
{
	tavor_hw_cqe_t	*old_cqe, *new_cqe;
	uint32_t	new_cons_indx, wrap_around_mask;
	int		i;

	TAVOR_TNF_ENTER(tavor_cq_resize_helper);

	ASSERT(MUTEX_HELD(&cq->cq_lock));

	/* Start the consumer index for the "new" CQ at zero */
	new_cons_indx = 0;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
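	/*
	 * For example, a 256-entry CQ yields a mask of 0xFF, so
	 * incrementing an index of 255 wraps it back around to 0.
	 */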
	wrap_around_mask = (cq->cq_bufsz - 1);

	/*
	 * Calculate the pointers to the first CQ entry (in the "old" CQ)
	 * and the first CQ entry in the "new" CQ
	 */
	old_cqe = &cq->cq_buf[old_cons_indx];
	new_cqe = &new_cqbuf[new_cons_indx];

	/* Sync entire "old" CQ for use by software (if necessary). */
	if (cq->cq_sync) {
		(void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
		    0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
	}

	/*
	 * Pull the requested number of entries ('num_newcqe', determined by
	 * the caller) from the "old" CQ, copying each into the "new" CQ and
	 * updating the respective indices and pointers in both.
	 */
	for (i = 0; i < num_newcqe; i++) {

		/* Copy this old CQE into the "new_cqe" pointer */
		bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));

		/* Increment the consumer index (for both CQs) */
		old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
		new_cons_indx = (new_cons_indx + 1);

		/* Update the pointer to the next CQ entry */
		old_cqe = &cq->cq_buf[old_cons_indx];
		new_cqe = &new_cqbuf[new_cons_indx];
	}

	TAVOR_TNF_EXIT(tavor_cq_resize_helper);
}


/*
 * tavor_cq_numcalc()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_numcalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
{
	uint32_t	tmp, log_num_cq;

	/*
	 * Generate a simple key from the counter.  Note:  We increment this
	 * static variable _intentionally_ without any kind of mutex around
	 * it.  First, single-threading all operations through a single lock
	 * would be a bad idea (from a performance point-of-view).  Second,
	 * the upper "unconstrained" bits don't really have to be unique
	 * because the lower bits are guaranteed to be (although we do make a
	 * best effort to ensure that they are).  Third, the window for the
	 * race (where both threads read and update the counter at the same
	 * time) is incredibly small.
	 */
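	/*
	 * Key composition sketch (the field widths here are purely
	 * illustrative): with cp_log_num_cq = 16, a counter value of 0x5,
	 * and a CQC resource index of 0x001C,
	 *
	 *     tmp  = 0x5 << 16             = 0x00050000
	 *     *key = (0x00050000 | 0x001C) & TAVOR_CQ_MAXNUMBER_MSK
	 *
	 * so the low-order bits still uniquely identify the CQC resource,
	 * whatever value the unsynchronized counter happens to produce.
	 */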
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_cqnum_cnt))
	log_num_cq = state->ts_cfg_profile->cp_log_num_cq;
	tmp = (tavor_debug_cqnum_cnt++) << log_num_cq;
	*key = (tmp | indx) & TAVOR_CQ_MAXNUMBER_MSK;
}

/*
 * tavor_cq_srq_entries_flush()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
{
	tavor_cqhdl_t		cq;
	tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t		new_indx, check_indx, indx;
	uint32_t		num_to_increment;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	ASSERT(wqhdr->wq_wrid_post != NULL);
	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/*
	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
	 * clients to modify any user-mapped CQ.  If the CQ is user-mapped,
	 * then we simply return here, and this "flush" function becomes a
	 * NO-OP in this case.
	 */
	if (cq->cq_is_umap) {
		return;
	}

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Loop through the CQ looking for entries owned by software.  Each
	 * time we find one, we increment the 'outstanding_cqes' count so
	 * that we know how many entries in total are on the CQ.  We use
	 * this value further down to know how many entries to loop through
	 * looking for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_buf[tail_cons_indx];

		/* sync the next cqe to read */
		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
	}
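	/*
	 * At this point 'tail_cons_indx' points at the first hardware-owned
	 * entry and 'outstanding_cqes' counts the software-owned entries in
	 * the range [cons_indx, tail_cons_indx).
	 */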

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know the
	 * total number of outstanding CQEs.  Set both 'check_indx' and
	 * 'new_indx' to the last software-owned entry (i.e. one before
	 * 'tail_cons_indx').
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_buf[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.  We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_qpnum &&
		    cqe_type == TAVOR_COMPLETION_RECV) {

			/* Add back to SRQ free list */
			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
			    cq, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_buf[new_indx];

				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}
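	/*
	 * Illustration of the compaction above, using a four-entry region
	 * where '*' marks the one CQE belonging to this SRQ's QP (the scan
	 * runs from the tail back toward 'cons_indx'):
	 *
	 *     before:  [ A ][ * ][ B ][ C ]      cons_indx = 0
	 *     after:   [ A ][ A ][ B ][ C ]      new_indx  = 0
	 *
	 * Entries A, B and C survive (shifted toward the tail); the stale
	 * slot at index 0 is handed back to hardware ownership below.
	 */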

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If at least one entry was removed */
	if (check_indx != new_indx) {

		/*
		 * At this point, all surviving (non-matching) entries have
		 * been compacted into the slots above 'new_indx'.  Walk from
		 * the beginning consumer index up to 'new_indx', marking
		 * each freed slot as having HW ownership; 'new_indx' + 1
		 * will then become the new consumer index.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_buf[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
		}
	}

	/*
	 * Update the consumer index to point just past 'new_indx'.  This
	 * moves it past all removed entries: 'new_indx' points at the slot
	 * immediately below the first surviving software-owned entry, so
	 * adding 1 makes 'cons_indx' point at that first surviving entry.
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;
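	/*
	 * In the compaction example above, 'new_indx' ends at 0, so
	 * 'cons_indx' becomes 1 and points at the surviving copy of A.
	 */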

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
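		/*
		 * For example, moving the consumer index from 10 to 13
		 * consumes three entries, so the doorbell value is 2.
		 */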
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;

		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);
	}
}
