XRC implementation, consolidated (version 2):

The XRC ops were moved to their own structure at the end of
struct ibv_context (to preserve binary compatibility).
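
For reference, a rough sketch of the libibverbs side this relies on (the
member names are taken from the hunks below; the exact layout of the
installed verbs.h may differ):

	/* Sketch: the XRC entry points live in a separate ops table, and
	 * struct ibv_context only grows by one pointer appended at its
	 * end, so offsets of all pre-existing members are unchanged. */
	struct ibv_more_ops {
		struct ibv_srq *	(*create_xrc_srq)(struct ibv_pd *pd,
							  struct ibv_xrc_domain *xrc_domain,
							  struct ibv_cq *xrc_cq,
							  struct ibv_srq_init_attr *attr);
		struct ibv_xrc_domain *	(*open_xrc_domain)(struct ibv_context *context,
							   int fd, int oflag);
		int			(*close_xrc_domain)(struct ibv_xrc_domain *d);
	};

	struct ibv_context {
		/* ... existing members, unchanged ... */
		struct ibv_more_ops    *more_ops;	/* new, appended last */
	};

libmlx4 fills this table with mlx4_create_xrc_srq, mlx4_open_xrc_domain
and mlx4_close_xrc_domain (see the src/mlx4.c hunks below).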

Check for the ibv_context.xrc_ops member via AC_CHECK_MEMBER.

XRC QPs have the MSB set in their QP number, for identification in
completion handling.
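
In other words, bit 23 of the 24-bit QPN space marks an XRC QP. Condensed
(and slightly paraphrased) from the cq.c hunk below, where qpn holds the
CQE's QP number in host byte order:

	if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) {
		/* XRC receive completion: the CQE carries the XRC SRQ
		 * number, not a QP we track, so look up the SRQ instead. */
		srqn = ntohl(cqe->g_mlpath_rqpn) & 0xffffff;
		srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn);
	}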

Various bug fixes.
(OFED 1.3 commit 39fe7f47e8fc07f356098df048d00740ba585fc5)

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
---
V2:
1. checkpatch.pl cleanup
2. Changed xrc_ops to more_ops
3. Check for xrc verbs in ibv_more_ops via AC_CHECK_MEMBER

diff --git a/configure.in b/configure.in
index 25f27f7..46a3a64 100644
--- a/configure.in
+++ b/configure.in
@@ -42,6 +42,12 @@ AC_CHECK_HEADER(valgrind/memcheck.h,
 dnl Checks for typedefs, structures, and compiler characteristics.
 AC_C_CONST
 AC_CHECK_SIZEOF(long)
+AC_CHECK_MEMBER(struct ibv_context.more_ops,
+    [AC_DEFINE([HAVE_IBV_MORE_OPS], 1, [Define to 1 if more_ops is a member of ibv_context])],,
+    [#include <infiniband/verbs.h>])
+AC_CHECK_MEMBER(struct ibv_more_ops.create_xrc_srq,
+    [AC_DEFINE([HAVE_IBV_XRC_OPS], 1, [Define to 1 if have xrc ops])],,
+    [#include <infiniband/verbs.h>])
 
 dnl Checks for library functions
 AC_CHECK_FUNC(ibv_read_sysfs_file, [],
diff --git a/src/cq.c b/src/cq.c
index 68e16e9..c598b87 100644
--- a/src/cq.c
+++ b/src/cq.c
@@ -194,8 +194,9 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 {
 	struct mlx4_wq *wq;
 	struct mlx4_cqe *cqe;
-	struct mlx4_srq *srq;
+	struct mlx4_srq *srq = NULL;
 	uint32_t qpn;
+	uint32_t srqn;
 	uint32_t g_mlpath_rqpn;
 	uint16_t wqe_index;
 	int is_error;
@@ -221,20 +223,29 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 		MLX4_CQE_OPCODE_ERROR;
 
-	if (!*cur_qp ||
-	    (ntohl(cqe->my_qpn) & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
+	if (qpn & MLX4_XRC_QPN_BIT && !is_send) {
+		srqn = ntohl(cqe->g_mlpath_rqpn) & 0xffffff;
+		/*
+		 * We do not have to take the XRC SRQ table lock here,
+		 * because CQs will be locked while XRC SRQs are removed
+		 * from the table.
+		 */
+		srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn);
+		if (!srq)
+			return CQ_POLL_ERR;
+	} else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
 		/*
 		 * We do not have to take the QP table lock here,
 		 * because CQs will be locked while QPs are removed
 		 * from the table.
 		 */
 		*cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context),
-				       ntohl(cqe->my_qpn) & 0xffffff);
+				       qpn & 0xffffff);
 		if (!*cur_qp)
 			return CQ_POLL_ERR;
 	}
 
-	wc->qp_num = (*cur_qp)->ibv_qp.qp_num;
+	wc->qp_num = qpn & 0xffffff;
 
 	if (is_send) {
 		wq = &(*cur_qp)->sq;
@@ -242,6 +254,10 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 		wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
 		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 		++wq->tail;
+	} else if (srq) {
+		wqe_index = htons(cqe->wqe_index);
+		wc->wr_id = srq->wrid[wqe_index];
+		mlx4_free_srq_wqe(srq, wqe_index);
 	} else if ((*cur_qp)->ibv_qp.srq) {
 		srq = to_msrq((*cur_qp)->ibv_qp.srq);
 		wqe_index = htons(cqe->wqe_index);
@@ -387,6 +403,10 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
 	uint32_t prod_index;
 	uint8_t owner_bit;
 	int nfreed = 0;
+	int is_xrc_srq = 0;
+
+	if (srq && srq->ibv_srq.xrc_cq)
+		is_xrc_srq = 1;
 
 	/*
 	 * First we need to find the current producer index, so we
@@ -405,7 +425,12 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
 	 */
 	while ((int) --prod_index - (int) cq->cons_index >= 0) {
 		cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
-		if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
+		if (is_xrc_srq &&
+		    ((ntohl(cqe->g_mlpath_rqpn) & 0xffffff) == srq->srqn) &&
+		    !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
+			mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
+			++nfreed;
+		} else if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
 			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
 				mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
 			++nfreed;
diff --git a/src/mlx4-abi.h b/src/mlx4-abi.h
index 20a40c9..1b1253c 100644
--- a/src/mlx4-abi.h
+++ b/src/mlx4-abi.h
@@ -68,6 +68,14 @@ struct mlx4_resize_cq {
 	__u64				buf_addr;
 };
 
+#ifdef HAVE_IBV_XRC_OPS
+struct mlx4_create_xrc_srq {
+	struct ibv_create_xrc_srq	ibv_cmd;
+	__u64				buf_addr;
+	__u64				db_addr;
+};
+#endif
+
 struct mlx4_create_srq {
 	struct ibv_create_srq		ibv_cmd;
 	__u64				buf_addr;
@@ -90,4 +98,12 @@ struct mlx4_create_qp {
 	__u8				reserved[5];
 };
 
+#ifdef HAVE_IBV_XRC_OPS
+struct mlx4_open_xrc_domain_resp {
+	struct ibv_open_xrc_domain_resp	ibv_resp;
+	__u32				xrcdn;
+	__u32				reserved;
+};
+#endif
+
 #endif /* MLX4_ABI_H */
diff --git a/src/mlx4.c b/src/mlx4.c
index 671e849..27ca75d 100644
--- a/src/mlx4.c
+++ b/src/mlx4.c
@@ -68,6 +68,16 @@ struct {
 	HCA(MELLANOX, 0x673c),	/* MT25408 "Hermon" QDR PCIe gen2 */
 };
 
+#ifdef HAVE_IBV_MORE_OPS
+static struct ibv_more_ops mlx4_more_ops = {
+#ifdef HAVE_IBV_XRC_OPS
+	.create_xrc_srq   = mlx4_create_xrc_srq,
+	.open_xrc_domain  = mlx4_open_xrc_domain,
+	.close_xrc_domain = mlx4_close_xrc_domain,
+#endif
+};
+#endif
+
 static struct ibv_context_ops mlx4_ctx_ops = {
 	.query_device  = mlx4_query_device,
 	.query_port    = mlx4_query_port,
@@ -124,6 +134,15 @@ static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_
 	for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
 		context->qp_table[i].refcnt = 0;
 
+	context->num_xrc_srqs = resp.qp_tab_size;
+	context->xrc_srq_table_shift = ffs(context->num_xrc_srqs) - 1
+				       - MLX4_XRC_SRQ_TABLE_BITS;
+	context->xrc_srq_table_mask = (1 << context->xrc_srq_table_shift) - 1;
+
+	pthread_mutex_init(&context->xrc_srq_table_mutex, NULL);
+	for (i = 0; i < MLX4_XRC_SRQ_TABLE_SIZE; ++i)
+		context->xrc_srq_table[i].refcnt = 0;
+
 	for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
 		context->db_list[i] = NULL;
 
@@ -156,6 +175,9 @@ static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_
 	pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
 
 	context->ibv_ctx.ops = mlx4_ctx_ops;
+#ifdef HAVE_IBV_XRC_OPS
+	context->ibv_ctx.more_ops = &mlx4_more_ops;
+#endif
 
 	if (mlx4_query_device(&context->ibv_ctx, &dev_attrs))
 		goto query_free;
diff --git a/src/mlx4.h b/src/mlx4.h
index 8643d8f..3eadb98 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -79,6 +79,11 @@
 
 #endif
 
+#ifndef HAVE_IBV_MORE_OPS
+#undef HAVE_IBV_XRC_OPS
+#undef HAVE_IBV_CREATE_QP_EXP
+#endif
+
 #define HIDDEN		__attribute__((visibility ("hidden")))
 
 #define PFX		"mlx4: "
@@ -111,6 +116,16 @@ enum {
 	MLX4_QP_TABLE_MASK		= MLX4_QP_TABLE_SIZE - 1
 };
 
+enum {
+	MLX4_XRC_SRQ_TABLE_BITS		= 8,
+	MLX4_XRC_SRQ_TABLE_SIZE		= 1 << MLX4_XRC_SRQ_TABLE_BITS,
+	MLX4_XRC_SRQ_TABLE_MASK		= MLX4_XRC_SRQ_TABLE_SIZE - 1
+};
+
+enum {
+	MLX4_XRC_QPN_BIT		= (1 << 23)
+};
+
 enum mlx4_db_type {
 	MLX4_DB_TYPE_CQ,
 	MLX4_DB_TYPE_RQ,
@@ -174,6 +189,15 @@ struct mlx4_context {
 	int				max_sge;
 	int				max_cqe;
 
+	struct {
+		struct mlx4_srq       **table;
+		int			refcnt;
+	}				xrc_srq_table[MLX4_XRC_SRQ_TABLE_SIZE];
+	pthread_mutex_t			xrc_srq_table_mutex;
+	int				num_xrc_srqs;
+	int				xrc_srq_table_shift;
+	int				xrc_srq_table_mask;
+
 	struct mlx4_db_page	       *db_list[MLX4_NUM_DB_TYPE];
 	pthread_mutex_t			db_list_mutex;
 };
@@ -260,6 +284,11 @@ struct mlx4_ah {
 	struct mlx4_av			av;
 };
 
+struct mlx4_xrc_domain {
+	struct ibv_xrc_domain		ibv_xrcd;
+	uint32_t			xrcdn;
+};
+
 static inline unsigned long align(unsigned long val, unsigned long align)
 {
 	return (val + align - 1) & ~(align - 1);
@@ -304,6 +333,13 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
 	return to_mxxx(ah, ah);
 }
 
+#ifdef HAVE_IBV_XRC_OPS
+static inline struct mlx4_xrc_domain *to_mxrcd(struct ibv_xrc_domain *ibxrcd)
+{
+	return to_mxxx(xrcd, xrc_domain);
+}
+#endif
+
 int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
 void mlx4_free_buf(struct mlx4_buf *buf);
 
@@ -350,6 +386,10 @@ void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
 int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
 		       struct ibv_recv_wr *wr,
 		       struct ibv_recv_wr **bad_wr);
+struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
+int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
+		       struct mlx4_srq *srq);
+void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
 
 struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
 int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
@@ -380,5 +420,16 @@ int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
 int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
 		   struct mlx4_ah *ah);
 void mlx4_free_av(struct mlx4_ah *ah);
+#ifdef HAVE_IBV_XRC_OPS
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
+				    struct ibv_xrc_domain *xrc_domain,
+				    struct ibv_cq *xrc_cq,
+				    struct ibv_srq_init_attr *attr);
+struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
+					    int fd, int oflag);
+
+int mlx4_close_xrc_domain(struct ibv_xrc_domain *d);
+#endif
+
 
 #endif /* MLX4_H */
diff --git a/src/qp.c b/src/qp.c
index 01e8580..2f02430 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -226,7 +226,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
 		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
-		ctrl->srcrb_flags =
+		ctrl->xrcrb_flags =
 			(wr->send_flags & IBV_SEND_SIGNALED ?
 			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
 			(wr->send_flags & IBV_SEND_SOLICITED ?
@@ -243,6 +243,9 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		size = sizeof *ctrl / 16;
 
 		switch (ibqp->qp_type) {
+		case IBV_QPT_XRC:
+			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
+			/* fall thru */
 		case IBV_QPT_RC:
 		case IBV_QPT_UC:
 			switch (wr->opcode) {
@@ -543,6 +546,7 @@ void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
 		size += sizeof (struct mlx4_wqe_raddr_seg);
 		break;
 
+	case IBV_QPT_XRC:
 	case IBV_QPT_RC:
 		size += sizeof (struct mlx4_wqe_raddr_seg);
 		/*
@@ -631,6 +635,7 @@ void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
 
 	case IBV_QPT_UC:
 	case IBV_QPT_RC:
+	case IBV_QPT_XRC:
 		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
 		break;
 
diff --git a/src/srq.c b/src/srq.c
index ba2ceb9..1350792 100644
--- a/src/srq.c
+++ b/src/srq.c
@@ -167,3 +167,53 @@ int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
 
 	return 0;
 }
+
+struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+{
+	int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
+
+	if (ctx->xrc_srq_table[tind].refcnt)
+		return ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask];
+	else
+		return NULL;
+}
+
+int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
+		       struct mlx4_srq *srq)
+{
+	int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
+	int ret = 0;
+
+	pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+
+	if (!ctx->xrc_srq_table[tind].refcnt) {
+		ctx->xrc_srq_table[tind].table = calloc(ctx->xrc_srq_table_mask + 1,
+							sizeof(struct mlx4_srq *));
+		if (!ctx->xrc_srq_table[tind].table) {
+			ret = -1;
+			goto out;
+		}
+	}
+
+	++ctx->xrc_srq_table[tind].refcnt;
+	ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = srq;
+
+out:
+	pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+	return ret;
+}
+
+void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+{
+	int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
+
+	pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+
+	if (!--ctx->xrc_srq_table[tind].refcnt)
+		free(ctx->xrc_srq_table[tind].table);
+	else
+		ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = NULL;
+
+	pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+}
+
diff --git a/src/verbs.c b/src/verbs.c
index 400050c..b7c9c8e 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -368,18 +368,36 @@ int mlx4_query_srq(struct ibv_srq *srq,
 	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
 }
 
-int mlx4_destroy_srq(struct ibv_srq *srq)
+int mlx4_destroy_srq(struct ibv_srq *ibsrq)
 {
+	struct mlx4_srq *srq = to_msrq(ibsrq);
+	struct mlx4_cq *mcq = NULL;
 	int ret;
 
-	ret = ibv_cmd_destroy_srq(srq);
-	if (ret)
+	if (ibsrq->xrc_cq) {
+		/* is an xrc_srq */
+		mcq = to_mcq(ibsrq->xrc_cq);
+		mlx4_cq_clean(mcq, 0, srq);
+		pthread_spin_lock(&mcq->lock);
+		mlx4_clear_xrc_srq(to_mctx(ibsrq->context), srq->srqn);
+		pthread_spin_unlock(&mcq->lock);
+	}
+
+	ret = ibv_cmd_destroy_srq(ibsrq);
+	if (ret) {
+		if (ibsrq->xrc_cq) {
+			pthread_spin_lock(&mcq->lock);
+			mlx4_store_xrc_srq(to_mctx(ibsrq->context),
+					   srq->srqn, srq);
+			pthread_spin_unlock(&mcq->lock);
+		}
 		return ret;
+	}
 
-	mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
-	mlx4_free_buf(&to_msrq(srq)->buf);
-	free(to_msrq(srq)->wrid);
-	free(to_msrq(srq));
+	mlx4_free_db(to_mctx(ibsrq->context), MLX4_DB_TYPE_RQ, srq->db);
+	mlx4_free_buf(&srq->buf);
+	free(srq->wrid);
+	free(srq);
 
 	return 0;
 }
@@ -415,7 +433,7 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
 	qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
 	qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
 
-	if (attr->srq)
+	if (attr->srq || attr->qp_type == IBV_QPT_XRC)
 		attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0;
 	else {
 		if (attr->cap.max_recv_sge < 1)
@@ -433,7 +451,7 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
 	    pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
 		goto err_free;
 
-	if (!attr->srq) {
+	if (!attr->srq && attr->qp_type != IBV_QPT_XRC) {
 		qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
 		if (!qp->db)
 			goto err_free;
@@ -442,7 +460,7 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
 	}
 
 	cmd.buf_addr	    = (uintptr_t) qp->buf.buf;
-	if (attr->srq)
+	if (attr->srq || attr->qp_type == IBV_QPT_XRC)
 		cmd.db_addr = 0;
 	else
 		cmd.db_addr = (uintptr_t) qp->db;
@@ -485,7 +503,7 @@ err_destroy:
 
 err_rq_db:
 	pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
-	if (!attr->srq)
+	if (!attr->srq && attr->qp_type != IBV_QPT_XRC)
 		mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db);
 
 err_free:
@@ -544,7 +562,7 @@ int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
 			mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
 
 		mlx4_init_qp_indices(to_mqp(qp));
-		if (!qp->srq)
+		if (!qp->srq && qp->qp_type != IBV_QPT_XRC)
 			*to_mqp(qp)->db = 0;
 	}
 
@@ -603,7 +621,7 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp)
 	mlx4_unlock_cqs(ibqp);
 	pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
 
-	if (!ibqp->srq)
+	if (!ibqp->srq && ibqp->qp_type != IBV_QPT_XRC)
 		mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
 	free(qp->sq.wrid);
 	if (qp->rq.wqe_cnt)
@@ -661,3 +679,103 @@ int mlx4_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid)
 
 	return 0;
 }
+
+#ifdef HAVE_IBV_XRC_OPS
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
+				    struct ibv_xrc_domain *xrc_domain,
+				    struct ibv_cq *xrc_cq,
+				    struct ibv_srq_init_attr *attr)
+{
+	struct mlx4_create_xrc_srq  cmd;
+	struct mlx4_create_srq_resp resp;
+	struct mlx4_srq		   *srq;
+	int			    ret;
+
+	/* Sanity check SRQ size before proceeding */
+	if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
+		return NULL;
+
+	srq = malloc(sizeof *srq);
+	if (!srq)
+		return NULL;
+
+	if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+		goto err;
+
+	srq->max     = align_queue_size(attr->attr.max_wr + 1);
+	srq->max_gs  = attr->attr.max_sge;
+	srq->counter = 0;
+
+	if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
+		goto err;
+
+	srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
+	if (!srq->db)
+		goto err_free;
+
+	*srq->db = 0;
+
+	cmd.buf_addr = (uintptr_t) srq->buf.buf;
+	cmd.db_addr  = (uintptr_t) srq->db;
+
+	ret = ibv_cmd_create_xrc_srq(pd, &srq->ibv_srq, attr,
+				     xrc_domain->handle,
+				     xrc_cq->handle,
+				     &cmd.ibv_cmd, sizeof cmd,
+				     &resp.ibv_resp, sizeof resp);
+	if (ret)
+		goto err_db;
+
+	srq->ibv_srq.xrc_srq_num = srq->srqn = resp.srqn;
+
+	ret = mlx4_store_xrc_srq(to_mctx(pd->context), srq->ibv_srq.xrc_srq_num, srq);
+	if (ret)
+		goto err_destroy;
+
+	return &srq->ibv_srq;
+
+err_destroy:
+	ibv_cmd_destroy_srq(&srq->ibv_srq);
+
+err_db:
+	mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
+
+err_free:
+	free(srq->wrid);
+	mlx4_free_buf(&srq->buf);
+
+err:
+	free(srq);
+
+	return NULL;
+}
+
+struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
+					    int fd, int oflag)
+{
+	int ret;
+	struct mlx4_open_xrc_domain_resp resp;
+	struct mlx4_xrc_domain *xrcd;
+
+	xrcd = malloc(sizeof *xrcd);
+	if (!xrcd)
+		return NULL;
+
+	ret = ibv_cmd_open_xrc_domain(context, fd, oflag, &xrcd->ibv_xrcd,
+				      &resp.ibv_resp, sizeof resp);
+	if (ret) {
+		free(xrcd);
+		return NULL;
+	}
+
+	xrcd->xrcdn = resp.xrcdn;
+	return &xrcd->ibv_xrcd;
+}
+
+int mlx4_close_xrc_domain(struct ibv_xrc_domain *d)
+{
+	ibv_cmd_close_xrc_domain(d);
+	free(d);
+	return 0;
+}
+#endif
diff --git a/src/wqe.h b/src/wqe.h
index 6f7f309..fa2f8ac 100644
--- a/src/wqe.h
+++ b/src/wqe.h
@@ -65,7 +65,7 @@ struct mlx4_wqe_ctrl_seg {
 	 * [1]   SE (solicited event)
	 * [0]   FL (force loopback)
 	 */
-	uint32_t		srcrb_flags;
+	uint32_t		xrcrb_flags;
 	/*
 	 * imm is immediate data for send/RDMA write w/ immediate;
 	 * also invalidation key for send with invalidate; input