/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_qp.c
 *    Tavor Queue Pair Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, and
 *    querying the Tavor queue pairs.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>
#include <sys/ib/ib_pkt_hdrs.h>

static int tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp,
    tavor_rsrc_t *qpc);
static int tavor_qpn_avl_compare(const void *q, const void *e);
static int tavor_special_qp_rsrc_alloc(tavor_state_t *state,
    ibt_sqp_type_t type, uint_t port, tavor_rsrc_t **qp_rsrc);
static int tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port);
static void tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);

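/*
 * Illustrative call sequence (a sketch, not part of the driver proper):
 * how a caller might drive tavor_qp_alloc().  The caller-side variables
 * (attr_p, ibt_qphdl, state) are assumed to exist; the tavor_qp_info_t
 * fields and flag values shown are the ones this file actually consumes.
 *
 *	tavor_qp_info_t		qpinfo;
 *	tavor_qp_options_t	op;
 *	ibt_chan_sizes_t	queue_sizes;
 *	ib_qpn_t		qpn;
 *	tavor_qphdl_t		qphdl;
 *	int			status;
 *
 *	qpinfo.qpi_attrp     = attr_p;
 *	qpinfo.qpi_type      = IBT_RC_RQP;
 *	qpinfo.qpi_ibt_qphdl = ibt_qphdl;
 *	qpinfo.qpi_queueszp  = &queue_sizes;
 *	qpinfo.qpi_qpn	     = &qpn;
 *	op.qpo_wq_loc	     = TAVOR_QUEUE_LOCATION_NORMAL;
 *	status = tavor_qp_alloc(state, &qpinfo, TAVOR_SLEEP, &op);
 *	if (status == DDI_SUCCESS)
 *		qphdl = qpinfo.qpi_qphdl;
 */
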
/*
 * tavor_qp_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
    uint_t sleepflag, tavor_qp_options_t *op)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*qpc, *rsrc, *rdb;
	tavor_umap_db_entry_t	*umapdb;
	tavor_qphdl_t		qp;
	ibt_qp_alloc_attr_t	*attr_p;
	ibt_qp_type_t		type;
	ibtl_qp_hdl_t		ibt_qphdl;
	ibt_chan_sizes_t	*queuesz_p;
	ib_qpn_t		*qpn;
	tavor_qphdl_t		*qphdl;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_srqhdl_t		srq;
	tavor_pdhdl_t		pd;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_mrhdl_t		mr;
	uint64_t		value, qp_desc_off;
	uint32_t		*sq_buf, *rq_buf;
	uint32_t		log_qp_sq_size, log_qp_rq_size;
	uint32_t		sq_size, rq_size;
	uint32_t		sq_wqe_size, rq_wqe_size;
	uint32_t		max_rdb, max_sgl, uarpg;
	uint_t			wq_location, dma_xfer_mode, qp_is_umap;
	uint_t			qp_srq_en;
	int			status, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_qp_alloc);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p, *queuesz_p))

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the QP's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->qpo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_qp_info_t structure
	 */
	attr_p	  = qpinfo->qpi_attrp;
	type	  = qpinfo->qpi_type;
	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
	queuesz_p = qpinfo->qpi_queueszp;
	qpn	  = qpinfo->qpi_qpn;
	qphdl	  = &qpinfo->qpi_qphdl;

	/*
	 * Determine whether QP is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the QP is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	qp_is_umap = (attr_p->qp_alloc_flags & IBT_QP_USER_MAP) ? 1 : 0;
	if (qp_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto qpalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/*
	 * Determine whether QP is being associated with an SRQ
	 */
	qp_srq_en = (attr_p->qp_alloc_flags & IBT_QP_USES_SRQ) ? 1 : 0;
	if (qp_srq_en) {
		/*
		 * Check for valid SRQ handle pointers
		 */
		if (attr_p->qp_ibc_srq_hdl == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_SRQ_HDL_INVALID,
			    "invalid SRQ handle");
			goto qpalloc_fail;
		}
		srq = (tavor_srqhdl_t)attr_p->qp_ibc_srq_hdl;
	}

	/*
	 * Check for valid QP service type (only UD/RC/UC supported)
	 */
	if (((type != IBT_UD_RQP) && (type != IBT_RC_RQP) &&
	    (type != IBT_UC_RQP))) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid serv type");
		goto qpalloc_fail;
	}

	/*
	 * Only RC is supported on an SRQ -- This is a Tavor hardware
	 * limitation.  Arbel native mode will not have this shortcoming.
	 */
	if (qp_srq_en && type != IBT_RC_RQP) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid serv type with SRQ");
		goto qpalloc_fail;
	}

	/*
	 * Check for valid PD handle pointer
	 */
	if (attr_p->qp_pd_hdl == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
		goto qpalloc_fail;
	}
	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;

	/*
	 * If on an SRQ, check to make sure the PD is the same
	 */
	if (qp_srq_en && (pd->pd_pdnum != srq->srq_pdhdl->pd_pdnum)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
		goto qpalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Check for valid CQ handle pointers
	 */
	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto qpalloc_fail1;
	}
	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;

	/*
	 * Increment the reference count on the CQs.  One or both of these
	 * could return error if we determine that the given CQ is already
	 * being used with a special (SMI/GSI) QP.
	 */
	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_NORMAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto qpalloc_fail1;
	}
	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_NORMAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto qpalloc_fail2;
	}

	/*
	 * Allocate a QP context entry.  This will be filled in with all
	 * the necessary parameters to define the Queue Pair.  Unlike
	 * other Tavor hardware resources, ownership is not immediately
	 * given to hardware in the final step here.  Instead, we must
	 * wait until the QP is later transitioned to the "Init" state before
	 * passing the QP to hardware.  If we fail here, we must undo all
	 * the reference counts (CQ and PD).
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPC, 1, sleepflag, &qpc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP context");
		goto qpalloc_fail3;
	}

	/*
	 * Allocate the software structure for tracking the queue pair
	 * (i.e. the Tavor Queue Pair handle).  If we fail here, we must
	 * undo the reference counts and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle");
		goto qpalloc_fail4;
	}
	qp = (tavor_qphdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))

	/*
	 * Calculate the QP number from QPC index.  This routine handles
	 * all of the operations necessary to keep track of used, unused,
	 * and released QP numbers.
	 */
	status = tavor_qp_create_qpn(state, qp, qpc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QPN create");
		goto qpalloc_fail5;
	}

	/*
	 * If this will be a user-mappable QP, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further QP operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (qp_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, qp->qp_qpnum,
		    MLNX_UMAP_QPMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto qpalloc_fail6;
		}
	}

	/*
	 * If this is an RC QP, then pre-allocate the maximum number of RDB
	 * entries.  This allows us to ensure that we can later cover all
	 * the resources needed by hardware for handling multiple incoming
	 * RDMA Reads.  Note: These resources are obviously not always
	 * necessary.  They are allocated here anyway.  Someday maybe this
	 * can be modified to allocate these on-the-fly (i.e. only if RDMA
	 * Read or Atomic operations are enabled) XXX
	 * If we fail here, we have a bunch of resource and reference count
	 * cleanup to do.
	 */
	if (type == IBT_RC_RQP) {
		max_rdb = state->ts_cfg_profile->cp_hca_max_rdma_in_qp;
		status = tavor_rsrc_alloc(state, TAVOR_RDB, max_rdb,
		    sleepflag, &rdb);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed RDB");
			goto qpalloc_fail7;
		}
		qp->qp_rdbrsrcp = rdb;
		/* Calculate offset (into DDR memory) of RDB entries */
		rsrc_pool = &state->ts_rsrc_hdl[TAVOR_RDB];
		qp->qp_rdb_ddraddr = (uintptr_t)rsrc_pool->rsrc_ddr_offset +
		    (rdb->tr_indx << TAVOR_RDB_SIZE_SHIFT);
	}

	/*
	 * Calculate the appropriate size for the work queues.
	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
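	/*
	 * Note on the log2 math below: highbit() returns the 1-based
	 * position of the most significant set bit, so for a size that is
	 * not already a power-of-2 it yields the rounded-up log2 (e.g.
	 * cs_sq == 100 gives highbit() == 7, i.e. 128 entries).  When the
	 * requested size is an exact power-of-2 (the "x & (x - 1)" test
	 * below), highbit() is one too large and is decremented (e.g.
	 * cs_sq == 128 gives highbit() == 8, adjusted back to 7).
	 */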
	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
	if ((attr_p->qp_sizes.cs_sq & (attr_p->qp_sizes.cs_sq - 1)) == 0) {
		log_qp_sq_size = log_qp_sq_size - 1;
	}
	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
	if ((attr_p->qp_sizes.cs_rq & (attr_p->qp_sizes.cs_rq - 1)) == 0) {
		log_qp_rq_size = log_qp_rq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
	    (!qp_srq_en && (log_qp_rq_size >
	    state->ts_cfg_profile->cp_log_max_qp_sz))) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size");
		goto qpalloc_fail8;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
	    (!qp_srq_en && (attr_p->qp_sizes.cs_rq_sgl > max_sgl))) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL");
		goto qpalloc_fail8;
	}

	/*
	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
	 * This will depend on the requested number of SGLs.  Note: this
	 * has the side-effect of also calculating the real number of SGLs
	 * (for the calculated WQE size).
	 *
	 * For QP's on an SRQ, we set these to 0.
	 */
	if (qp_srq_en) {
		qp->qp_rq_log_wqesz = 0;
		qp->qp_rq_sgl = 0;
	} else {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
		    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz,
		    &qp->qp_rq_sgl);
	}
	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
	    TAVOR_QP_WQ_TYPE_SENDQ, &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);

	/*
	 * Allocate the memory for QP work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in
	 * through the tavor_qp_options_t structure.  Since Tavor work queues
	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
	 * the work queue memory is very important.  We used to allocate
	 * work queues (the combined receive and send queues) so that they
	 * would be aligned on their combined size.  That alignment guaranteed
	 * that they would never cross the 4GB boundary (Tavor work queues
	 * are on the order of MBs at maximum).  Now we are able to relax
	 * this alignment constraint by ensuring that the IB address assigned
	 * to the queue memory (as a result of the tavor_mr_register() call)
	 * is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment
	 * that was more restrictive than the system page size.
	 * So we avoided this constraint by passing two alignment values,
	 * one for the memory allocation itself and the other for the DMA
	 * handle (for later bind).  This used to cause more memory than
	 * necessary to be allocated (in order to guarantee the more
	 * restrictive alignment constraint).  But by guaranteeing the
	 * zero-based IB virtual address for the queue, we are able to
	 * conserve this memory.
	 * Note: If QP is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 */
	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
	sq_size	    = (1 << log_qp_sq_size) * sq_wqe_size;

	/* QP on SRQ sets these to 0 */
	if (qp_srq_en) {
		rq_wqe_size = 0;
		rq_size	    = 0;
	} else {
		rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
		rq_size	    = (1 << log_qp_rq_size) * rq_wqe_size;
	}

	qp->qp_wqinfo.qa_size = sq_size + rq_size;
	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_bind_align  = max(sq_wqe_size, rq_wqe_size);
	if (qp_is_umap) {
		qp->qp_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		qp->qp_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue");
		goto qpalloc_fail8;
	}
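	/*
	 * Note (added for clarity): both queue sizes are power-of-2
	 * multiples of their respective WQE sizes, so placing the queue
	 * with the larger WQE size first keeps the second queue aligned
	 * on its own WQE size within the single combined allocation.
	 */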
	if (sq_wqe_size > rq_wqe_size) {
		sq_buf = qp->qp_wqinfo.qa_buf_aligned;

		/*
		 * If QP's on an SRQ, we set the rq_buf to NULL
		 */
		if (qp_srq_en)
			rq_buf = NULL;
		else
			rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
	} else {
		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
	}

	/*
	 * Register the memory for the QP work queues.  The memory for the
	 * QP must be registered in the Tavor TPT tables.  This gives us the
	 * LKey to specify in the QP context later.  Note: The memory for
	 * Tavor work queues (both Send and Recv) must be contiguous and
	 * registered as a single memory region.  Note also: If the work
	 * queue is to be allocated from DDR memory, then only a "bypass"
	 * mapping is appropriate.  And if the QP memory is user-mappable,
	 * then we force DDI_DMA_CONSISTENT mapping.
	 * Also, in order to meet the alignment restriction, we pass the
	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
	 * This guarantees that the resulting IB vaddr will be zero-based
	 * (modulo the offset into the first page).
	 * If we fail here, we still have the bunch of resource and reference
	 * count cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
	mr_attr.mr_as	    = NULL;
	mr_attr.mr_flags    = flag;
	if (qp_is_umap) {
		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto qpalloc_fail9;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
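	/*
	 * (In other words: for a WQE at kernel virtual address "kva", the
	 * corresponding IB virtual address is "kva - qp_desc_off".)
	 */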

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real work queue sizes, real SGLs, and QP number
	 */
	if (queuesz_p != NULL) {
		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;

		/* QP on an SRQ sets these to 0 */
		if (qp_srq_en) {
			queuesz_p->cs_rq	= 0;
			queuesz_p->cs_rq_sgl	= 0;
		} else {
			queuesz_p->cs_rq	= (1 << log_qp_rq_size);
			queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
		}
	}
	if (qpn != NULL) {
		*qpn = (ib_qpn_t)qp->qp_qpnum;
	}

	/*
	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
	 * the following fields for use in further operations on the QP.
	 */
	qp->qp_qpcrsrcp		= qpc;
	qp->qp_rsrcp		= rsrc;
	qp->qp_state		= TAVOR_QP_RESET;
	qp->qp_pdhdl		= pd;
	qp->qp_mrhdl		= mr;
	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
	qp->qp_is_special	= 0;
	qp->qp_is_umap		= qp_is_umap;
	qp->qp_uarpg		= (qp->qp_is_umap) ? uarpg : 0;
	qp->qp_umap_dhp		= (devmap_cookie_t)NULL;
	qp->qp_sq_cqhdl		= sq_cq;
	qp->qp_sq_lastwqeaddr	= NULL;
	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
	qp->qp_sq_buf		= sq_buf;
	qp->qp_desc_off		= qp_desc_off;
	qp->qp_rq_cqhdl		= rq_cq;
	qp->qp_rq_lastwqeaddr	= NULL;
	qp->qp_rq_buf		= rq_buf;

	/* QP on an SRQ sets this to 0 */
	if (qp_srq_en) {
		qp->qp_rq_bufsz		= 0;
	} else {
		qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
	}

	qp->qp_forward_sqd_event  = 0;
	qp->qp_sqd_still_draining = 0;
	qp->qp_hdlrarg		= (void *)ibt_qphdl;
	qp->qp_mcg_refcnt	= 0;

	/*
	 * If this QP is to be associated with an SRQ, then set the SRQ handle
	 * appropriately.
	 */
	if (qp_srq_en) {
		qp->qp_srqhdl = srq;
		qp->qp_srq_en = TAVOR_QP_SRQ_ENABLED;
		tavor_srq_refcnt_inc(qp->qp_srqhdl);
	} else {
		qp->qp_srqhdl = NULL;
		qp->qp_srq_en = TAVOR_QP_SRQ_DISABLED;
	}

	/* Determine if later ddi_dma_sync will be necessary */
	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);

	/* Determine the QP service type */
	if (type == IBT_RC_RQP) {
		qp->qp_serv_type = TAVOR_QP_RC;
	} else if (type == IBT_UD_RQP) {
		qp->qp_serv_type = TAVOR_QP_UD;
	} else {
		qp->qp_serv_type = TAVOR_QP_UC;
	}

	/* Zero out the QP context */
	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));

	/*
	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
	 * "qphdl" and return success
	 */
	ASSERT(state->ts_qphdl[qpc->tr_indx] == NULL);
	state->ts_qphdl[qpc->tr_indx] = qp;

	/*
	 * If this is a user-mappable QP, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (qp_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	*qphdl = qp;

	TAVOR_TNF_EXIT(tavor_qp_alloc);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
qpalloc_fail9:
	tavor_queue_free(state, &qp->qp_wqinfo);
qpalloc_fail8:
	if (type == IBT_RC_RQP) {
		tavor_rsrc_free(state, &rdb);
	}
qpalloc_fail7:
	if (qp_is_umap) {
		tavor_umap_db_free(umapdb);
	}
qpalloc_fail6:
	/*
	 * Releasing the QPN will also free up the QPC context.  Update
	 * the QPC context pointer to indicate this.
	 */
	tavor_qp_release_qpn(state, qp->qp_qpn_hdl, TAVOR_QPN_RELEASE);
	qpc = NULL;
qpalloc_fail5:
	tavor_rsrc_free(state, &rsrc);
qpalloc_fail4:
	if (qpc) {
		tavor_rsrc_free(state, &qpc);
	}
qpalloc_fail3:
	tavor_cq_refcnt_dec(rq_cq);
qpalloc_fail2:
	tavor_cq_refcnt_dec(sq_cq);
qpalloc_fail1:
	tavor_pd_refcnt_dec(pd);
qpalloc_fail:
	TNF_PROBE_1(tavor_qp_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_qp_alloc);
	return (status);
}


/*
 * tavor_special_qp_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_special_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
    uint_t sleepflag, tavor_qp_options_t *op)
{
	tavor_rsrc_t		*qpc, *rsrc;
	tavor_qphdl_t		qp;
	ibt_qp_alloc_attr_t	*attr_p;
	ibt_sqp_type_t		type;
	uint8_t			port;
	ibtl_qp_hdl_t		ibt_qphdl;
	ibt_chan_sizes_t	*queuesz_p;
	tavor_qphdl_t		*qphdl;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_pdhdl_t		pd;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_mrhdl_t		mr;
	uint64_t		qp_desc_off;
	uint32_t		*sq_buf, *rq_buf;
	uint32_t		log_qp_sq_size, log_qp_rq_size;
	uint32_t		sq_size, rq_size, max_sgl;
	uint32_t		sq_wqe_size, rq_wqe_size;
	uint_t			wq_location, dma_xfer_mode;
	int			status, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_special_qp_alloc);

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the QP's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->qpo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_qp_info_t structure
	 */
	attr_p	  = qpinfo->qpi_attrp;
	type	  = qpinfo->qpi_type;
	port	  = qpinfo->qpi_port;
	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
	queuesz_p = qpinfo->qpi_queueszp;
	qphdl	  = &qpinfo->qpi_qphdl;

	/*
	 * Check for valid special QP type (only SMI & GSI supported)
	 */
	if ((type != IBT_SMI_SQP) && (type != IBT_GSI_SQP)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_QP_SPECIAL_TYPE_INVALID, "invalid QP type");
		goto spec_qpalloc_fail;
	}

	/*
	 * Check for valid port number
	 */
	if (!tavor_portnum_is_valid(state, port)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num");
		goto spec_qpalloc_fail;
	}
	port = port - 1;

	/*
	 * Check for valid PD handle pointer
	 */
	if (attr_p->qp_pd_hdl == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
		goto spec_qpalloc_fail;
	}
	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;

	/* Increment the reference count on the PD */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Check for valid CQ handle pointers
	 */
	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto spec_qpalloc_fail1;
	}
	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;

	/*
	 * Increment the reference count on the CQs.  One or both of these
	 * could return error if we determine that the given CQ is already
	 * being used with a non-special QP (i.e. a normal QP).
	 */
	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_SPECIAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto spec_qpalloc_fail1;
	}
	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_SPECIAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto spec_qpalloc_fail2;
	}

	/*
	 * Allocate the special QP resources.  Essentially, this allocation
	 * amounts to checking if the requested special QP has already been
	 * allocated.  If successful, the QP context returned is an actual
	 * QP context that has been "aliased" to act as a special QP of the
	 * appropriate type (and for the appropriate port).  Just as in
	 * tavor_qp_alloc() above, ownership for this QP context is not
	 * immediately given to hardware in the final step here.  Instead, we
	 * wait until the QP is later transitioned to the "Init" state before
	 * passing the QP to hardware.  If we fail here, we must undo all
	 * the reference counts (CQ and PD).
	 */
	status = tavor_special_qp_rsrc_alloc(state, type, port, &qpc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed special QP rsrc");
		goto spec_qpalloc_fail3;
	}

	/*
	 * Allocate the software structure for tracking the special queue
	 * pair (i.e. the Tavor Queue Pair handle).  If we fail here, we
	 * must undo the reference counts and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle");
		goto spec_qpalloc_fail4;
	}
	qp = (tavor_qphdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))

	/*
	 * Actual QP number is a combination of the index of the QPC and
	 * the port number.  This is because the special QP contexts must
	 * be allocated two-at-a-time.
	 */
	qp->qp_qpnum = qpc->tr_indx + port;

	/*
	 * Calculate the appropriate size for the work queues.
	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
	if ((attr_p->qp_sizes.cs_sq & (attr_p->qp_sizes.cs_sq - 1)) == 0) {
		log_qp_sq_size = log_qp_sq_size - 1;
	}
	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
	if ((attr_p->qp_sizes.cs_rq & (attr_p->qp_sizes.cs_rq - 1)) == 0) {
		log_qp_rq_size = log_qp_rq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a bit of cleanup to do before returning.
	 */
	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
	    (log_qp_rq_size > state->ts_cfg_profile->cp_log_max_qp_sz)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size");
		goto spec_qpalloc_fail5;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
	    (attr_p->qp_sizes.cs_rq_sgl > max_sgl)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL");
		goto spec_qpalloc_fail5;
	}

	/*
	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
	 * This will depend on the requested number of SGLs.  Note: this
	 * has the side-effect of also calculating the real number of SGLs
	 * (for the calculated WQE size).
	 */
	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
	    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl);
	if (type == IBT_SMI_SQP) {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
		    TAVOR_QP_WQ_TYPE_SENDMLX_QP0, &qp->qp_sq_log_wqesz,
		    &qp->qp_sq_sgl);
	} else {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
		    TAVOR_QP_WQ_TYPE_SENDMLX_QP1, &qp->qp_sq_log_wqesz,
		    &qp->qp_sq_sgl);
	}

	/*
	 * Allocate the memory for QP work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in
	 * through the tavor_qp_options_t structure.  Since Tavor work queues
	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
	 * the work queue memory is very important.  We used to allocate
	 * work queues (the combined receive and send queues) so that they
	 * would be aligned on their combined size.  That alignment guaranteed
	 * that they would never cross the 4GB boundary (Tavor work queues
	 * are on the order of MBs at maximum).  Now we are able to relax
	 * this alignment constraint by ensuring that the IB address assigned
	 * to the queue memory (as a result of the tavor_mr_register() call)
	 * is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment
	 * that was more restrictive than the system page size.
	 * So we avoided this constraint by passing two alignment values,
	 * one for the memory allocation itself and the other for the DMA
	 * handle (for later bind).  This used to cause more memory than
	 * necessary to be allocated (in order to guarantee the more
	 * restrictive alignment constraint).  But by guaranteeing the
	 * zero-based IB virtual address for the queue, we are able to
	 * conserve this memory.
	 */
	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
	rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
	sq_size	    = (1 << log_qp_sq_size) * sq_wqe_size;
	rq_size	    = (1 << log_qp_rq_size) * rq_wqe_size;
	qp->qp_wqinfo.qa_size	  = sq_size + rq_size;
	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_bind_align  = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_location = wq_location;
	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue");
		goto spec_qpalloc_fail5;
	}
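	/* As in tavor_qp_alloc(): larger-WQE queue first to keep alignment */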
	if (sq_wqe_size > rq_wqe_size) {
		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
		rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
	} else {
		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
	}

	/*
	 * Register the memory for the special QP work queues.  The memory for
	 * the special QP must be registered in the Tavor TPT tables.  This
	 * gives us the LKey to specify in the QP context later.  Note: The
	 * memory for Tavor work queues (both Send and Recv) must be contiguous
	 * and registered as a single memory region.  Note also: If the work
	 * queue is to be allocated from DDR memory, then only a "bypass"
	 * mapping is appropriate.
	 * Also, in order to meet the alignment restriction, we pass the
	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
	 * This guarantees that the resulting IB vaddr will be zero-based
	 * (modulo the offset into the first page).
	 * If we fail here, we have a bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
	mr_attr.mr_as	    = NULL;
	mr_attr.mr_flags    = flag;
	if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;

		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
		if (dma_xfer_mode == DDI_DMA_STREAMING) {
			mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
		}
	} else {
		mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
	}
	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto spec_qpalloc_fail6;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real work queue sizes, real SGLs, and QP number (which will be
	 * either zero or one, depending on the special QP type)
	 */
	if (queuesz_p != NULL) {
		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
		queuesz_p->cs_rq	= (1 << log_qp_rq_size);
		queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
	}

	/*
	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
	 * the following fields for use in further operations on the QP.
	 */
	qp->qp_qpcrsrcp		= qpc;
	qp->qp_rsrcp		= rsrc;
	qp->qp_state		= TAVOR_QP_RESET;
	qp->qp_pdhdl		= pd;
	qp->qp_mrhdl		= mr;
	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
	qp->qp_is_special	= (type == IBT_SMI_SQP) ?
	    TAVOR_QP_SMI : TAVOR_QP_GSI;
	qp->qp_is_umap		= 0;
	qp->qp_uarpg		= 0;
	qp->qp_sq_cqhdl		= sq_cq;
	qp->qp_sq_lastwqeaddr	= NULL;
	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
	qp->qp_sq_buf		= sq_buf;
	qp->qp_desc_off		= qp_desc_off;
	qp->qp_rq_cqhdl		= rq_cq;
	qp->qp_rq_lastwqeaddr	= NULL;
	qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
	qp->qp_rq_buf		= rq_buf;
	qp->qp_portnum		= port;
	qp->qp_pkeyindx		= 0;
	qp->qp_hdlrarg		= (void *)ibt_qphdl;
	qp->qp_mcg_refcnt	= 0;
	qp->qp_srq_en		= 0;
	qp->qp_srqhdl		= NULL;

	/* Determine if later ddi_dma_sync will be necessary */
	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);

	/* All special QPs are UD QP service type */
	qp->qp_serv_type = TAVOR_QP_UD;

	/* Zero out the QP context */
	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));

	/*
	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
	 * "qphdl" and return success
	 */
	ASSERT(state->ts_qphdl[qpc->tr_indx + port] == NULL);
	state->ts_qphdl[qpc->tr_indx + port] = qp;

	*qphdl = qp;

	TAVOR_TNF_EXIT(tavor_special_qp_alloc);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
spec_qpalloc_fail6:
	tavor_queue_free(state, &qp->qp_wqinfo);
spec_qpalloc_fail5:
	tavor_rsrc_free(state, &rsrc);
spec_qpalloc_fail4:
	if (tavor_special_qp_rsrc_free(state, type, port) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to free special QP rsrc");
	}
spec_qpalloc_fail3:
	tavor_cq_refcnt_dec(rq_cq);
spec_qpalloc_fail2:
	tavor_cq_refcnt_dec(sq_cq);
spec_qpalloc_fail1:
	tavor_pd_refcnt_dec(pd);
spec_qpalloc_fail:
	TNF_PROBE_1(tavor_special_qp_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_special_qp_alloc);
	return (status);
}


/*
 * tavor_qp_free()
 *    This function frees up the QP resources.  Depending on the value
 *    of the "free_qp_flags", the QP number may not be released until
 *    a subsequent call to tavor_qp_release_qpn().
 *
 *    Context: Can be called only from user or kernel context.
 */
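/*
 * Illustrative call (a sketch; the "qphdl" below is assumed to come from
 * a prior tavor_qp_alloc() and "qpn_handle" is caller-provided storage):
 *
 *	ibc_qpn_hdl_t	qpn_handle;
 *	int		status;
 *
 *	status = tavor_qp_free(state, &qphdl, IBC_FREE_QP_ONLY,
 *	    &qpn_handle, TAVOR_NOSLEEP);
 *
 * With IBC_FREE_QP_ONLY the QPN entry handle is returned in "qpn_handle"
 * and the QP number itself stays reserved; any other flag value causes
 * the QP number to be released immediately (see the flag check below).
 */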
/* ARGSUSED */
int
tavor_qp_free(tavor_state_t *state, tavor_qphdl_t *qphdl,
    ibc_free_qp_flags_t free_qp_flags, ibc_qpn_hdl_t *qpnh,
    uint_t sleepflag)
{
	tavor_rsrc_t		*qpc, *rdb, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_qpn_entry_t	*entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_srqhdl_t		srq;
	tavor_qphdl_t		qp;
	uint64_t		value;
	uint_t			type, port;
	uint_t			maxprot;
	uint_t			qp_srq_en;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_qp_free);

	/*
	 * Pull all the necessary information from the Tavor Queue Pair
	 * handle.  This is necessary here because the resource for the
	 * QP handle is going to be freed up as part of this operation.
	 */
	qp	= *qphdl;
	mutex_enter(&qp->qp_lock);
	qpc	= qp->qp_qpcrsrcp;
	rsrc	= qp->qp_rsrcp;
	pd	= qp->qp_pdhdl;
	srq	= qp->qp_srqhdl;
	mr	= qp->qp_mrhdl;
	rq_cq	= qp->qp_rq_cqhdl;
	sq_cq	= qp->qp_sq_cqhdl;
	rdb	= qp->qp_rdbrsrcp;
	port	= qp->qp_portnum;
	qp_srq_en = qp->qp_srq_en;

	/*
	 * If the QP is part of an MCG, then we fail the qp_free
	 */
	if (qp->qp_mcg_refcnt != 0) {
		mutex_exit(&qp->qp_lock);
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "QP part of MCG on free");
		goto qpfree_fail;
	}

	/*
	 * If the QP is not already in "Reset" state, then transition to
	 * "Reset".  This is necessary because software does not reclaim
	 * ownership of the QP context until the QP is in the "Reset" state.
	 * If the ownership transfer fails for any reason, then it is an
	 * indication that something (either in HW or SW) has gone seriously
	 * wrong.  So we print a warning message and return.
	 */
	if (qp->qp_state != TAVOR_QP_RESET) {
		if (tavor_qp_to_reset(state, qp) != DDI_SUCCESS) {
			mutex_exit(&qp->qp_lock);
			TAVOR_WARNING(state, "failed to reset QP context");
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "reset QP context");
			goto qpfree_fail;
		}
		qp->qp_state = TAVOR_QP_RESET;

		/*
		 * Do any additional handling necessary for the transition
		 * to the "Reset" state (e.g. update the WRID lists)
		 */
		tavor_wrid_to_reset_handling(state, qp);
	}

	/*
	 * If this was a user-mappable QP, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the QP memory to an invalid mapping.
	 * We also need to invalidate the QP tracking information for the
	 * user mapping.
	 */
	if (qp->qp_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, qp->qp_qpnum,
		    MLNX_UMAP_QPMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&qp->qp_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_qp_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (qp->qp_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(qp->qp_umap_dhp,
			    state->ts_dip, 0, 0, qp->qp_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&qp->qp_lock);
				TAVOR_WARNING(state, "failed in QP memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_qp_free);
				return (ibc_get_ci_failure(0));
			}
			qp->qp_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor QPNum-to-QPHdl list.  This will allow any
	 * in-progress events to detect that the QP corresponding to this
	 * number has been freed.  Note: it does depend on whether we are
1164	 * freeing a special QP or not.
1165	 */
1166	if (qp->qp_is_special) {
1167		state->ts_qphdl[qpc->tr_indx + port] = NULL;
1168	} else {
1169		state->ts_qphdl[qpc->tr_indx] = NULL;
1170	}
1171
1172	/*
1173	 * Drop the QP lock
1174	 *    At this point the lock is no longer necessary.  We cannot
1175	 *    protect from multiple simultaneous calls to free the same QP.
1176	 *    In addition, since the QP lock is contained in the QP "software
1177	 *    handle" resource, which we will free (see below), it is
1178	 *    important that we have no further references to that memory.
1179	 */
1180	mutex_exit(&qp->qp_lock);
1181	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
1182
1183	/*
1184	 * Free the QP resources
1185	 *    Start by deregistering and freeing the memory for work queues.
1186	 *    Next free any previously allocated context information
1187	 *    (depending on QP type)
1188	 *    Finally, decrement the necessary reference counts.
1189	 * If this fails for any reason, then it is an indication that
1190	 * something (either in HW or SW) has gone seriously wrong.  So we
1191	 * print a warning message and return.
1192	 */
1193	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
1194	    sleepflag);
1195	if (status != DDI_SUCCESS) {
1196		TAVOR_WARNING(state, "failed to deregister QP memory");
1197		/* Set "status" and "errormsg" and goto failure */
1198		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed deregister mr");
1199		goto qpfree_fail;
1200	}
1201
1202	/* Free the memory for the QP */
1203	tavor_queue_free(state, &qp->qp_wqinfo);
1204
1205	/*
1206	 * Free up the remainder of the QP resources.  Note: we have a few
1207	 * different resources to free up depending on whether the QP is a
1208	 * special QP or not.  As described above, if any of these fail for
1209	 * any reason it is an indication that something (either in HW or SW)
1210	 * has gone seriously wrong.  So we print a warning message and
1211	 * return.
1212	 */
1213	if (qp->qp_is_special) {
1214		type = (qp->qp_is_special == TAVOR_QP_SMI) ?
1215		    IBT_SMI_SQP : IBT_GSI_SQP;
1216
1217		/* Free up resources for the special QP */
1218		status = tavor_special_qp_rsrc_free(state, type, port);
1219		if (status != DDI_SUCCESS) {
1220			TAVOR_WARNING(state, "failed to free special QP rsrc");
1221			/* Set "status" and "errormsg" and goto failure */
1222			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1223			    "failed special QP rsrc");
1224			goto qpfree_fail;
1225		}
1226
1227	} else {
1228		type = qp->qp_serv_type;
1229
1230		/* Free up the RDB entries resource */
1231		if (type == TAVOR_QP_RC) {
1232			tavor_rsrc_free(state, &rdb);
1233		}
1234
1235		/*
1236		 * Check the flags and determine whether to release the
1237		 * QPN or not, based on their value.
1238		 */
1239		if (free_qp_flags == IBC_FREE_QP_ONLY) {
1240			entry = qp->qp_qpn_hdl;
1241			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
1242			    TAVOR_QPN_FREE_ONLY);
1243			*qpnh = (ibc_qpn_hdl_t)entry;
1244		} else {
1245			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
1246			    TAVOR_QPN_RELEASE);
1247		}
1248	}
1249
1250	/* Free the Tavor Queue Pair handle */
1251	tavor_rsrc_free(state, &rsrc);
1252
1253	/* Decrement the reference counts on CQs, PD and SRQ (if needed) */
1254	tavor_cq_refcnt_dec(rq_cq);
1255	tavor_cq_refcnt_dec(sq_cq);
1256	tavor_pd_refcnt_dec(pd);
1257	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
1258		tavor_srq_refcnt_dec(srq);
1259	}
1260
1261	/* Set the qphdl pointer to NULL and return success */
1262	*qphdl = NULL;
1263
1264	TAVOR_TNF_EXIT(tavor_qp_free);
1265	return (DDI_SUCCESS);
1266
1267qpfree_fail:
1268	TNF_PROBE_1(tavor_qp_free_fail, TAVOR_TNF_ERROR, "",
1269	    tnf_string, msg, errormsg);
1270	TAVOR_TNF_EXIT(tavor_qp_free);
1271	return (status);
1272}
1273
1274
1275/*
1276 * tavor_qp_query()
1277 *    Context: Can be called from interrupt or base context.
1278 */
1279int
1280tavor_qp_query(tavor_state_t *state, tavor_qphdl_t qp,
1281    ibt_qp_query_attr_t *attr_p)
1282{
1283	ibt_cep_state_t		qp_state;
1284	ibt_qp_ud_attr_t	*ud;
1285	ibt_qp_rc_attr_t	*rc;
1286	ibt_qp_uc_attr_t	*uc;
1287	ibt_cep_flags_t		enable_flags;
1288	tavor_hw_addr_path_t	*qpc_path, *qpc_alt_path;
1289	ibt_cep_path_t		*path_ptr, *alt_path_ptr;
1290	tavor_hw_qpc_t		*qpc;
1291	int			status;
1292
1293	TAVOR_TNF_ENTER(tavor_qp_query);
1294
1295	mutex_enter(&qp->qp_lock);
1296
1297	/*
1298	 * Grab the temporary QPC entry from QP software state
1299	 */
1300	qpc = &qp->qpc;
1301
1302	/* Convert the current Tavor QP state to IBTF QP state */
1303	switch (qp->qp_state) {
1304	case TAVOR_QP_RESET:
1305		qp_state = IBT_STATE_RESET;		/* "Reset" */
1306		break;
1307	case TAVOR_QP_INIT:
1308		qp_state = IBT_STATE_INIT;		/* Initialized */
1309		break;
1310	case TAVOR_QP_RTR:
1311		qp_state = IBT_STATE_RTR;		/* Ready to Receive */
1312		break;
1313	case TAVOR_QP_RTS:
1314		qp_state = IBT_STATE_RTS;		/* Ready to Send */
1315		break;
1316	case TAVOR_QP_SQERR:
1317		qp_state = IBT_STATE_SQE;		/* Send Queue Error */
1318		break;
1319	case TAVOR_QP_SQD:
1320		if (qp->qp_sqd_still_draining) {
1321			qp_state = IBT_STATE_SQDRAIN;	/* SQ Draining */
1322		} else {
1323			qp_state = IBT_STATE_SQD;	/* SQ Drained */
1324		}
1325		break;
1326	case TAVOR_QP_ERR:
1327		qp_state = IBT_STATE_ERROR;		/* Error */
1328		break;
1329	default:
1330		mutex_exit(&qp->qp_lock);
1331		TNF_PROBE_1(tavor_qp_query_inv_qpstate_fail,
1332		    TAVOR_TNF_ERROR, "", tnf_uint, qpstate, qp->qp_state);
1333		TAVOR_TNF_EXIT(tavor_qp_query);
1334		return (ibc_get_ci_failure(0));
1335	}
1336	attr_p->qp_info.qp_state = qp_state;
1337
1338	/* SRQ Hook. */
1339	attr_p->qp_srq = NULL;
1340
1341	/*
1342	 * The following QP information is always returned, regardless of
1343	 * the current QP state.  Note: Some special handling is necessary
1344	 * for calculating the QP number on special QP (QP0 and QP1).
1345	 */
	attr_p->qp_sq_cq    = qp->qp_sq_cqhdl->cq_hdlrarg;
	attr_p->qp_rq_cq    = qp->qp_rq_cqhdl->cq_hdlrarg;
	if (qp->qp_is_special) {
		attr_p->qp_qpn = (qp->qp_is_special == TAVOR_QP_SMI) ? 0 : 1;
	} else {
		attr_p->qp_qpn = (ib_qpn_t)qp->qp_qpnum;
	}
	attr_p->qp_sq_sgl   = qp->qp_sq_sgl;
	attr_p->qp_rq_sgl   = qp->qp_rq_sgl;
	attr_p->qp_info.qp_sq_sz = qp->qp_sq_bufsz;
	attr_p->qp_info.qp_rq_sz = qp->qp_rq_bufsz;

	/*
	 * If the QP is currently in the "Reset" state, then only the
	 * above attributes are returned
	 */
	if (qp_state == IBT_STATE_RESET) {
		mutex_exit(&qp->qp_lock);
		TAVOR_TNF_EXIT(tavor_qp_query);
		return (DDI_SUCCESS);
	}

	/*
	 * Post QUERY_QP command to firmware
	 *
	 * We use TAVOR_CMD_NOSLEEP_SPIN here because we are holding the
	 * "qp_lock".  Since we may be in the interrupt context (or
	 * subsequently raised to interrupt level by priority inversion),
	 * we do not want to block in this routine waiting for success.
	 */
	status = tavor_cmn_query_cmd_post(state, QUERY_QP, qp->qp_qpnum,
	    qpc, sizeof (tavor_hw_qpc_t), TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		mutex_exit(&qp->qp_lock);
		cmn_err(CE_CONT, "Tavor: QUERY_QP command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_qp_query_cmd_fail, TAVOR_TNF_ERROR, "",
		    tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_qp_query);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Fill in the additional QP info based on the QP's transport type.
	 */
	if (qp->qp_serv_type == TAVOR_QP_UD) {

		/* Fill in the UD-specific info */
		ud = &attr_p->qp_info.qp_transport.ud;
		ud->ud_qkey	= (ib_qkey_t)qpc->qkey;
		ud->ud_sq_psn	= qpc->next_snd_psn;
		ud->ud_pkey_ix	= qpc->pri_addr_path.pkey_indx;
		ud->ud_port	= qpc->pri_addr_path.portnum;

		attr_p->qp_info.qp_trans = IBT_UD_SRV;

	} else if (qp->qp_serv_type == TAVOR_QP_RC) {

		/* Fill in the RC-specific info */
		rc = &attr_p->qp_info.qp_transport.rc;
		rc->rc_sq_psn	= qpc->next_snd_psn;
		rc->rc_rq_psn	= qpc->next_rcv_psn;
		rc->rc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
			rc->rc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
			rc->rc_mig_state = IBT_STATE_REARMED;
		} else {
			rc->rc_mig_state = IBT_STATE_ARMED;
		}
		rc->rc_rdma_ra_out = (1 << qpc->sra_max);
		rc->rc_rdma_ra_in  = (1 << qpc->rra_max);
		rc->rc_min_rnr_nak = qpc->min_rnr_nak;
		rc->rc_path_mtu	   = qpc->mtu;
		rc->rc_retry_cnt   = qpc->retry_cnt;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &rc->rc_path;
		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num = qpc_path->portnum;
		path_ptr->cep_timeout	   = qpc_path->ack_timeout;

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &rc->rc_alt_path;
		tavor_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;
		alt_path_ptr->cep_timeout	= qpc_alt_path->ack_timeout;

		/* Get the RNR retry count from the primary path */
		rc->rc_rnr_retry_cnt = qpc_path->rnr_retry;

		/* Set the enable flags based on RDMA/Atomic enable bits */
		enable_flags = IBT_CEP_NO_FLAGS;
		enable_flags |= ((qpc->rre == 0) ? 0 : IBT_CEP_RDMA_RD);
		enable_flags |= ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		enable_flags |= ((qpc->rae == 0) ? 0 : IBT_CEP_ATOMIC);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_RC_SRV;

	} else if (qp->qp_serv_type == TAVOR_QP_UC) {

		/* Fill in the UC-specific info */
		uc = &attr_p->qp_info.qp_transport.uc;
		uc->uc_sq_psn	= qpc->next_snd_psn;
		uc->uc_rq_psn	= qpc->next_rcv_psn;
		uc->uc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
			uc->uc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
			uc->uc_mig_state = IBT_STATE_REARMED;
		} else {
			uc->uc_mig_state = IBT_STATE_ARMED;
		}
		uc->uc_path_mtu = qpc->mtu;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &uc->uc_path;
		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num = qpc_path->portnum;

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &uc->uc_alt_path;
		tavor_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;

		/*
		 * Set the enable flags based on RDMA enable bits (by
		 * definition UC doesn't support Atomic or RDMA Read)
		 */
		enable_flags = ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_UC_SRV;

	} else {
		TAVOR_WARNING(state, "unexpected QP transport type");
		mutex_exit(&qp->qp_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Under certain circumstances it is possible for the Tavor hardware
	 * to transition to one of the error states without software directly
	 * knowing about it.  The QueryQP() call is the one place where we
	 * have an opportunity to sample and update our view of the QP state.
	 */
	if (qpc->state == TAVOR_QP_SQERR) {
		attr_p->qp_info.qp_state = IBT_STATE_SQE;
		qp->qp_state = TAVOR_QP_SQERR;
	}
	if (qpc->state == TAVOR_QP_ERR) {
		attr_p->qp_info.qp_state = IBT_STATE_ERROR;
		qp->qp_state = TAVOR_QP_ERR;
	}
	mutex_exit(&qp->qp_lock);

	TAVOR_TNF_EXIT(tavor_qp_query);
	return (DDI_SUCCESS);
}

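/*
 * The sketch below is purely illustrative and is not part of the driver
 * ("TAVOR_QP_EXAMPLES" is a hypothetical guard that no makefile defines,
 * and "tavor_qp_query_example" is an invented name): it simply shows how
 * a caller holding a QP handle might use tavor_qp_query() and pick out
 * the RC transport attributes filled in above.
 */
#ifdef TAVOR_QP_EXAMPLES
static int
tavor_qp_query_example(tavor_state_t *state, tavor_qphdl_t qp)
{
	ibt_qp_query_attr_t	attr;
	int			status;

	status = tavor_qp_query(state, qp, &attr);
	if (status != DDI_SUCCESS) {
		return (status);
	}

	/* The transport union is valid only for the QP's service type */
	if (attr.qp_info.qp_trans == IBT_RC_SRV) {
		cmn_err(CE_CONT, "QP 0x%x: dest QPN 0x%x, RDMA-R %s\n",
		    (uint_t)attr.qp_qpn,
		    (uint_t)attr.qp_info.qp_transport.rc.rc_dst_qpn,
		    (attr.qp_info.qp_flags & IBT_CEP_RDMA_RD) ? "on" : "off");
	}
	return (DDI_SUCCESS);
}
#endif	/* TAVOR_QP_EXAMPLES */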

/*
 * tavor_qp_create_qpn()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp, tavor_rsrc_t *qpc)
{
	tavor_qpn_entry_t	query;
	tavor_qpn_entry_t	*entry;
	avl_index_t		where;

	TAVOR_TNF_ENTER(tavor_qp_create_qpn);

	/*
	 * Build a query (for the AVL tree lookup) and attempt to find
	 * a previously added entry that has a matching QPC index.  If
	 * no matching entry is found, then allocate, initialize, and
	 * add an entry to the AVL tree.
	 * If a matching entry is found, then increment its QPN counter
	 * and reference counter.
	 */
	query.qpn_indx = qpc->tr_indx;
	mutex_enter(&state->ts_qpn_avl_lock);
	entry = (tavor_qpn_entry_t *)avl_find(&state->ts_qpn_avl,
	    &query, &where);
	if (entry == NULL) {
		/*
		 * Allocate and initialize a QPN entry, then insert
		 * it into the AVL tree.
		 */
		entry = (tavor_qpn_entry_t *)kmem_zalloc(
		    sizeof (tavor_qpn_entry_t), KM_NOSLEEP);
		if (entry == NULL) {
			mutex_exit(&state->ts_qpn_avl_lock);
			TAVOR_TNF_EXIT(tavor_qp_create_qpn);
			return (DDI_FAILURE);
		}
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*entry))

		entry->qpn_indx	   = qpc->tr_indx;
		entry->qpn_refcnt  = 0;
		entry->qpn_counter = 0;

		avl_insert(&state->ts_qpn_avl, entry, where);
	}

	/*
	 * Make the AVL tree entry point to the QP context resource that
	 * it will be responsible for tracking
	 */
	entry->qpn_qpc = qpc;

	/*
	 * Setup the QP handle to point to the AVL tree entry.  Then
	 * generate the new QP number from the entry's QPN counter value
	 * and the hardware's QP context table index.
	 */
	qp->qp_qpn_hdl	= entry;
	qp->qp_qpnum	= ((entry->qpn_counter <<
	    state->ts_cfg_profile->cp_log_num_qp) | qpc->tr_indx) &
	    TAVOR_QP_MAXNUMBER_MSK;

	/*
	 * Increment the reference counter and QPN counter.  The QPN
	 * counter always indicates the next available number for use.
	 */
	entry->qpn_counter++;
	entry->qpn_refcnt++;

	mutex_exit(&state->ts_qpn_avl_lock);
	TAVOR_TNF_EXIT(tavor_qp_create_qpn);
	return (DDI_SUCCESS);
}

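/*
 * Worked example (illustrative only; it assumes cp_log_num_qp == 16 and
 * that TAVOR_QP_MAXNUMBER_MSK covers the full 24-bit QPN space): for a
 * QPC table index of 0x0123 and a QPN counter value of 2, the routine
 * above computes ((2 << 16) | 0x0123) & 0xFFFFFF = 0x020123.  The low
 * cp_log_num_qp bits are "constrained" to the hardware QPC index, while
 * the upper counter bits make successive QPs built on the same QPC
 * entry distinguishable from one another.
 */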

/*
 * tavor_qp_release_qpn()
 *    Context: Can be called only from user or kernel context.
 */
void
tavor_qp_release_qpn(tavor_state_t *state, tavor_qpn_entry_t *entry, int flags)
{
	TAVOR_TNF_ENTER(tavor_qp_release_qpn);

	ASSERT(entry != NULL);

	mutex_enter(&state->ts_qpn_avl_lock);

	/*
	 * If we are releasing the QP number here, then we decrement the
	 * reference count and check for zero references.  If there are
	 * zero references, then we free the QPC context (if it hadn't
	 * already been freed during a TAVOR_QPN_FREE_ONLY free, i.e. for
	 * reuse with another similar QP number), remove the tracking
	 * structure from the QP number AVL tree, and free the structure.
	 * If we are not releasing the QP number here, then, as long as we
	 * have not exhausted the usefulness of the QPC context (that is,
	 * re-used it too many times without the reference count having
	 * gone to zero), we free up the QPC context for use by another
	 * thread (which will use it to construct a different QP number
	 * from the same QPC table index).
	 */
	if (flags == TAVOR_QPN_RELEASE) {
		entry->qpn_refcnt--;

		/*
		 * If the reference count is zero, then we free the QPC
		 * context (if it hadn't already been freed in an earlier
		 * step, e.g. TAVOR_QPN_FREE_ONLY) and remove/free the
		 * tracking structure from the QP number AVL tree.
		 */
		if (entry->qpn_refcnt == 0) {
			if (entry->qpn_qpc != NULL) {
				tavor_rsrc_free(state, &entry->qpn_qpc);
			}

			/*
			 * If the current entry has served its useful
			 * purpose (i.e. been reused the maximum allowable
			 * number of times), then remove it from the QP
			 * number AVL tree and free it up.
			 */
			if (entry->qpn_counter >= (1 <<
			    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
				avl_remove(&state->ts_qpn_avl, entry);
				kmem_free(entry, sizeof (tavor_qpn_entry_t));
			}
		}

	} else if (flags == TAVOR_QPN_FREE_ONLY) {
		/*
		 * Even if we are not freeing the QP number, that will not
		 * always prevent us from releasing the QPC context.  In fact,
		 * since the QPC context only forms part of the whole QPN,
		 * we want to free it up for use by other consumers.  But
		 * if the reference count is non-zero (which it will always
		 * be when we are doing TAVOR_QPN_FREE_ONLY) and the counter
		 * has reached its maximum value, then we cannot reuse the
		 * QPC context until the reference count eventually reaches
		 * zero (in TAVOR_QPN_RELEASE, above).
		 */
		if (entry->qpn_counter < (1 <<
		    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
			tavor_rsrc_free(state, &entry->qpn_qpc);
		}
	}
	mutex_exit(&state->ts_qpn_avl_lock);

	TAVOR_TNF_EXIT(tavor_qp_release_qpn);
}

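/*
 * Worked example (illustrative only, assuming cp_log_num_qp == 16): the
 * reuse limit tested above is (1 << (24 - 16)) = 256, since QP numbers
 * are 24 bits wide and the upper (24 - cp_log_num_qp) bits hold the
 * per-index counter.  Each QPC table index can therefore be paired with
 * 256 distinct counter values; once qpn_counter reaches 256, the entry
 * is removed from the AVL tree as soon as its reference count drops to
 * zero.
 */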

/*
 * tavor_qpn_avl_compare()
 *    Context: Can be called from user or kernel context.
 */
static int
tavor_qpn_avl_compare(const void *q, const void *e)
{
	tavor_qpn_entry_t	*entry, *query;

	TAVOR_TNF_ENTER(tavor_qpn_avl_compare);

	entry = (tavor_qpn_entry_t *)e;
	query = (tavor_qpn_entry_t *)q;

	if (query->qpn_indx < entry->qpn_indx) {
		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
		return (-1);
	} else if (query->qpn_indx > entry->qpn_indx) {
		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
		return (+1);
	} else {
		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
		return (0);
	}
}


/*
 * tavor_qpn_avl_init()
 *    Context: Only called from attach() path context
 */
void
tavor_qpn_avl_init(tavor_state_t *state)
{
	TAVOR_TNF_ENTER(tavor_qpn_avl_init);

	/* Initialize the lock used for QP number (QPN) AVL tree access */
	mutex_init(&state->ts_qpn_avl_lock, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(state->ts_intrmsi_pri));

	/* Initialize the AVL tree for the QP number (QPN) storage */
	avl_create(&state->ts_qpn_avl, tavor_qpn_avl_compare,
	    sizeof (tavor_qpn_entry_t),
	    offsetof(tavor_qpn_entry_t, qpn_avlnode));

	TAVOR_TNF_EXIT(tavor_qpn_avl_init);
}


/*
 * tavor_qpn_avl_fini()
 *    Context: Only called from attach() and/or detach() path contexts
 */
void
tavor_qpn_avl_fini(tavor_state_t *state)
{
	tavor_qpn_entry_t	*entry;
	void			*cookie;

	TAVOR_TNF_ENTER(tavor_qpn_avl_fini);

	/*
	 * Empty all entries (if necessary) and destroy the AVL tree
	 * that was used for QP number (QPN) tracking.
	 */
	cookie = NULL;
	while ((entry = (tavor_qpn_entry_t *)avl_destroy_nodes(
	    &state->ts_qpn_avl, &cookie)) != NULL) {
		kmem_free(entry, sizeof (tavor_qpn_entry_t));
	}
	avl_destroy(&state->ts_qpn_avl);

	/* Destroy the lock used for QP number (QPN) AVL tree access */
	mutex_destroy(&state->ts_qpn_avl_lock);

	TAVOR_TNF_EXIT(tavor_qpn_avl_fini);
}


/*
 * tavor_qphdl_from_qpnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the QP number is critical to the detection of a
 *    potential race condition in the QP event handler code (i.e. the case
 *    where a QP is freed and alloc'd again before an event for the
 *    "old" QP can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new QP owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported QPs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_qphdl_t
tavor_qphdl_from_qpnum(tavor_state_t *state, uint_t qpnum)
{
	uint_t	qpindx, qpmask;

	/* Calculate the QP table index from the qpnum */
	qpmask = (1 << state->ts_cfg_profile->cp_log_num_qp) - 1;
	qpindx = qpnum & qpmask;
	return (state->ts_qphdl[qpindx]);
}

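/*
 * Worked example (illustrative only, assuming cp_log_num_qp == 16):
 * qpmask = (1 << 16) - 1 = 0xFFFF, so QP number 0x020123 maps to handle
 * table index 0x0123.  Note that 0x020123 and 0x030123 (same index,
 * different counter bits) resolve to the same handle; the counter bits
 * exist only to help the event path distinguish the old and new owners
 * of a recycled QPC index.
 */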

/*
 * tavor_special_qp_rsrc_alloc
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_alloc(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port, tavor_rsrc_t **qp_rsrc)
{
	uint_t		mask, flags;
	int		status;

	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_alloc);

	mutex_enter(&state->ts_spec_qplock);
	flags = state->ts_spec_qpflags;
	if (type == IBT_SMI_SQP) {
		/*
		 * Check here to see if the driver has been configured
		 * to instruct the Tavor firmware to handle all incoming
		 * SMP messages (i.e. messages sent to the SMA).  If so,
		 * then we will treat QP0 as if it has already been
		 * allocated (for internal use).  Allowing the allocation
		 * to happen in that case would cause unexpected behavior
		 * (e.g. the Tavor SMA could become unresponsive).
		 */
		if (state->ts_cfg_profile->cp_qp0_agents_in_fw != 0) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_0(tavor_special_qp0_alloc_already_in_fw,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}

		/*
		 * If this is the first QP0 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp0->tr_indx, TAVOR_CMD_QP_SMI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_1(tavor_ts_spec_qp0_alloc_already,
			    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp0;

	} else {
		/*
		 * If this is the first QP1 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp1->tr_indx, TAVOR_CMD_QP_GSI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_0(tavor_ts_spec_qp1_alloc_already,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp1;
	}

	mutex_exit(&state->ts_spec_qplock);
	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
	return (DDI_SUCCESS);
}

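/*
 * Illustrative note on the ts_spec_qpflags layout (the bit positions
 * here are assumptions for the example; the real values come from the
 * TAVOR_SPECIAL_QP*_RSRC definitions in tavor.h): if
 * TAVOR_SPECIAL_QP0_RSRC were 0 and TAVOR_SPECIAL_QP1_RSRC were 2 on a
 * two-port HCA, QP0 allocations would occupy bits 0-1 (one per port)
 * and QP1 allocations bits 2-3, with the corresponding *_RSRC_MASK
 * values covering each pair.  That is why the "first allocation" tests
 * above can simply check an entire mask for zero.
 */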

/*
 * tavor_special_qp_rsrc_free
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port)
{
	uint_t		mask, flags;
	int		status;

	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_free);

	mutex_enter(&state->ts_spec_qplock);
	if (type == IBT_SMI_SQP) {
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP0 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_SMI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
				return (ibc_get_ci_failure(0));
			}
		}
	} else {
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP1 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_GSI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
				return (ibc_get_ci_failure(0));
			}
		}
	}

	mutex_exit(&state->ts_spec_qplock);
	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_qp_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	TAVOR_TNF_ENTER(tavor_qp_sgl_to_logwqesz);

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_SENDQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_SND_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if ((max_size & (max_size - 1)) == 0) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_SND_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Same as above (except for Recv WQEs)
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if ((max_size & (max_size - 1)) == 0) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP0:
		/*
		 * Same as above (except for MLX transport WQEs).  For these
		 * WQEs we have to account for the space consumed by the
		 * "inline" packet headers.  (This is smaller than for QP1
		 * below because QP0 is not allowed to send packets with a
		 * GRH.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP0_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if ((max_size & (max_size - 1)) == 0) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP0_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP1:
		/*
		 * Same as above.  For these WQEs we again have to account for
		 * the space consumed by the "inline" packet headers.  (This
		 * is larger than for QP0 above because we have to account for
		 * the possibility of a GRH in each packet - and this
		 * introduces an alignment issue that causes us to consume
		 * an additional 8 bytes.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP1_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if ((max_size & (max_size - 1)) == 0) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP1_HDRS) >> 4;
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		TNF_PROBE_0(tavor_qp_sgl_to_logwqesz_inv_wqtype_fail,
		    TAVOR_TNF_ERROR, "");
		/*
		 * Conservative fallback: without it, "log2" and
		 * "actual_sgl" would be used uninitialized below
		 */
		log2 = TAVOR_QP_WQE_LOG_MINIMUM;
		actual_sgl = 0;
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl  = min(state->ts_cfg_profile->cp_wqe_real_max_sgl,
	    actual_sgl);

	TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz);
}

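/*
 * Worked example for the send queue case (illustrative only; the
 * 64-byte header size is an assumption for the example, not necessarily
 * the value of TAVOR_QP_WQE_MLX_SND_HDRS): with num_sgl == 8 and 64
 * bytes of headers, max_size = 64 + (8 << 4) = 192.  highbit(192)
 * returns 8 (highbit(9F) is 1-indexed), and since 192 is not a power of
 * two, log2 remains 8 and the rounded-up WQE size is (1 << 8) = 256
 * bytes.  Such a WQE can actually hold ((256 - 64) >> 4) = 12 SGL
 * entries, which is then clamped by cp_wqe_real_max_sgl before being
 * returned in *max_sgl.
 */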