1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4/* Copyright (c) 2008-2019, IBM Corporation */
5
6#include <linux/errno.h>
7#include <linux/types.h>
8#include <linux/uaccess.h>
9#include <linux/vmalloc.h>
10#include <linux/xarray.h>
11#include <net/addrconf.h>
12
13#include <rdma/iw_cm.h>
14#include <rdma/ib_verbs.h>
15#include <rdma/ib_user_verbs.h>
16#include <rdma/uverbs_ioctl.h>
17
18#include "siw.h"
19#include "siw_verbs.h"
20#include "siw_mem.h"
21
22static int siw_qp_state_to_ib_qp_state[SIW_QP_STATE_COUNT] = {
23	[SIW_QP_STATE_IDLE] = IB_QPS_INIT,
24	[SIW_QP_STATE_RTR] = IB_QPS_RTR,
25	[SIW_QP_STATE_RTS] = IB_QPS_RTS,
26	[SIW_QP_STATE_CLOSING] = IB_QPS_SQD,
27	[SIW_QP_STATE_TERMINATE] = IB_QPS_SQE,
28	[SIW_QP_STATE_ERROR] = IB_QPS_ERR
29};
30
31static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
32	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
33	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
34	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
35	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
36	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
37	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
38	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
39};
40
41static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
42	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
43	[IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
44	[IB_QPS_ERR] = "ERR"
45};
46
47void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
48{
49	struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry);
50
51	kfree(entry);
52}
53
54int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
55{
56	struct siw_ucontext *uctx = to_siw_ctx(ctx);
57	size_t size = vma->vm_end - vma->vm_start;
58	struct rdma_user_mmap_entry *rdma_entry;
59	struct siw_user_mmap_entry *entry;
60	int rv = -EINVAL;
61
62	/*
63	 * Must be page aligned
64	 */
65	if (vma->vm_start & (PAGE_SIZE - 1)) {
66		pr_warn("siw: mmap not page aligned\n");
67		return -EINVAL;
68	}
69	rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
70	if (!rdma_entry) {
71		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
72			vma->vm_pgoff, size);
73		return -EINVAL;
74	}
75	entry = to_siw_mmap_entry(rdma_entry);
76
77	rv = remap_vmalloc_range(vma, entry->address, 0);
78	if (rv)
79		pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
80			size);
81	rdma_user_mmap_entry_put(rdma_entry);
82
83	return rv;
84}
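
/*
 * Illustrative sketch, not part of this driver: a userspace library would
 * use the mmap key returned in the uresp structures (for example
 * uresp.sq_key from QP creation) as the mmap() offset against the open
 * uverbs device fd. The names 'fd' and 'uresp' below are assumptions for
 * illustration only.
 *
 *	size_t sq_len = uresp.num_sqe * sizeof(struct siw_sqe);
 *	struct siw_sqe *sq = mmap(NULL, sq_len, PROT_READ | PROT_WRITE,
 *				  MAP_SHARED, fd, uresp.sq_key);
 *	if (sq == MAP_FAILED)
 *		... handle error ...
 *
 * The offset is looked up above via rdma_user_mmap_entry_get() and the
 * backing vmalloc() memory is remapped into the caller's address space.
 */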
85
86int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
87{
88	struct siw_device *sdev = to_siw_dev(base_ctx->device);
89	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
90	struct siw_uresp_alloc_ctx uresp = {};
91	int rv;
92
93	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
94		rv = -ENOMEM;
95		goto err_out;
96	}
97	ctx->sdev = sdev;
98
99	uresp.dev_id = sdev->vendor_part_id;
100
101	if (udata->outlen < sizeof(uresp)) {
102		rv = -EINVAL;
103		goto err_out;
104	}
105	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
106	if (rv)
107		goto err_out;
108
109	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
110		atomic_read(&sdev->num_ctx));
111
112	return 0;
113
114err_out:
115	atomic_dec(&sdev->num_ctx);
116	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
117		atomic_read(&sdev->num_ctx));
118
119	return rv;
120}
121
122void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
123{
124	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
125
126	atomic_dec(&uctx->sdev->num_ctx);
127}
128
129int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
130		     struct ib_udata *udata)
131{
132	struct siw_device *sdev = to_siw_dev(base_dev);
133
134	if (udata->inlen || udata->outlen)
135		return -EINVAL;
136
137	memset(attr, 0, sizeof(*attr));
138
139	/* Revisit atomic caps if RFC 7306 gets supported */
140	attr->atomic_cap = 0;
141	attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
142	attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG;
143	attr->max_cq = sdev->attrs.max_cq;
144	attr->max_cqe = sdev->attrs.max_cqe;
145	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
146	attr->max_mr = sdev->attrs.max_mr;
147	attr->max_mw = sdev->attrs.max_mw;
148	attr->max_mr_size = ~0ull;
149	attr->max_pd = sdev->attrs.max_pd;
150	attr->max_qp = sdev->attrs.max_qp;
151	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
152	attr->max_qp_rd_atom = sdev->attrs.max_ord;
153	attr->max_qp_wr = sdev->attrs.max_qp_wr;
154	attr->max_recv_sge = sdev->attrs.max_sge;
155	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
156	attr->max_send_sge = sdev->attrs.max_sge;
157	attr->max_sge_rd = sdev->attrs.max_sge_rd;
158	attr->max_srq = sdev->attrs.max_srq;
159	attr->max_srq_sge = sdev->attrs.max_srq_sge;
160	attr->max_srq_wr = sdev->attrs.max_srq_wr;
161	attr->page_size_cap = PAGE_SIZE;
162	attr->vendor_id = SIW_VENDOR_ID;
163	attr->vendor_part_id = sdev->vendor_part_id;
164
165	addrconf_addr_eui48((u8 *)&attr->sys_image_guid,
166			    sdev->raw_gid);
167
168	return 0;
169}
170
171int siw_query_port(struct ib_device *base_dev, u32 port,
172		   struct ib_port_attr *attr)
173{
174	struct siw_device *sdev = to_siw_dev(base_dev);
175	int rv;
176
177	memset(attr, 0, sizeof(*attr));
178
179	rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
180			 &attr->active_width);
181	attr->gid_tbl_len = 1;
182	attr->max_msg_sz = -1;
183	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
184	attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
185	attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
186		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
187	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
188	attr->state = sdev->state;
	/*
	 * All zero
	 *
	 * attr->lid = 0;
	 * attr->bad_pkey_cntr = 0;
	 * attr->qkey_viol_cntr = 0;
	 * attr->sm_lid = 0;
	 * attr->lmc = 0;
	 * attr->max_vl_num = 0;
	 * attr->sm_sl = 0;
	 * attr->subnet_timeout = 0;
	 * attr->init_type_reply = 0;
	 */
202	return rv;
203}
204
205int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
206			   struct ib_port_immutable *port_immutable)
207{
208	struct ib_port_attr attr;
209	int rv = siw_query_port(base_dev, port, &attr);
210
211	if (rv)
212		return rv;
213
214	port_immutable->gid_tbl_len = attr.gid_tbl_len;
215	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
216
217	return 0;
218}
219
220int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
221		  union ib_gid *gid)
222{
223	struct siw_device *sdev = to_siw_dev(base_dev);
224
225	/* subnet_prefix == interface_id == 0; */
226	memset(gid, 0, sizeof(*gid));
227	memcpy(gid->raw, sdev->raw_gid, ETH_ALEN);
228
229	return 0;
230}
231
232int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
233{
234	struct siw_device *sdev = to_siw_dev(pd->device);
235
236	if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
237		atomic_dec(&sdev->num_pd);
238		return -ENOMEM;
239	}
	siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));
241
242	return 0;
243}
244
245int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
246{
247	struct siw_device *sdev = to_siw_dev(pd->device);
248
249	siw_dbg_pd(pd, "free PD\n");
250	atomic_dec(&sdev->num_pd);
251	return 0;
252}
253
254void siw_qp_get_ref(struct ib_qp *base_qp)
255{
256	siw_qp_get(to_siw_qp(base_qp));
257}
258
259void siw_qp_put_ref(struct ib_qp *base_qp)
260{
261	siw_qp_put(to_siw_qp(base_qp));
262}
263
264static struct rdma_user_mmap_entry *
265siw_mmap_entry_insert(struct siw_ucontext *uctx,
266		      void *address, size_t length,
267		      u64 *offset)
268{
269	struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
270	int rv;
271
272	*offset = SIW_INVAL_UOBJ_KEY;
273	if (!entry)
274		return NULL;
275
276	entry->address = address;
277
278	rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
279					 &entry->rdma_entry,
280					 length);
281	if (rv) {
282		kfree(entry);
283		return NULL;
284	}
285
286	*offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
287
288	return &entry->rdma_entry;
289}
290
/*
 * siw_create_qp()
 *
 * Create QP of requested size on given device.
 *
 * @qp:		Queue pair
 * @attrs:	Initial QP attributes.
 * @udata:	used to provide QP ID, SQ and RQ size back to user.
 */
300
301int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
302		  struct ib_udata *udata)
303{
304	struct ib_pd *pd = ibqp->pd;
305	struct siw_qp *qp = to_siw_qp(ibqp);
306	struct ib_device *base_dev = pd->device;
307	struct siw_device *sdev = to_siw_dev(base_dev);
308	struct siw_ucontext *uctx =
309		rdma_udata_to_drv_context(udata, struct siw_ucontext,
310					  base_ucontext);
311	unsigned long flags;
312	int num_sqe, num_rqe, rv = 0;
313	size_t length;
314
315	siw_dbg(base_dev, "create new QP\n");
316
317	if (attrs->create_flags)
318		return -EOPNOTSUPP;
319
320	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
321		siw_dbg(base_dev, "too many QP's\n");
322		rv = -ENOMEM;
323		goto err_atomic;
324	}
325	if (attrs->qp_type != IB_QPT_RC) {
326		siw_dbg(base_dev, "only RC QP's supported\n");
327		rv = -EOPNOTSUPP;
328		goto err_atomic;
329	}
330	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
331	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
332	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
333	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
334		siw_dbg(base_dev, "QP size error\n");
335		rv = -EINVAL;
336		goto err_atomic;
337	}
338	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
339		siw_dbg(base_dev, "max inline send: %d > %d\n",
340			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
341		rv = -EINVAL;
342		goto err_atomic;
343	}
	/*
	 * NOTE: we don't allow a QP that is unable to hold any SQ WQE
	 */
347	if (attrs->cap.max_send_wr == 0) {
348		siw_dbg(base_dev, "QP must have send queue\n");
349		rv = -EINVAL;
350		goto err_atomic;
351	}
352
353	if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) {
354		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
355		rv = -EINVAL;
356		goto err_atomic;
357	}
358
359	init_rwsem(&qp->state_lock);
360	spin_lock_init(&qp->sq_lock);
361	spin_lock_init(&qp->rq_lock);
362	spin_lock_init(&qp->orq_lock);
363
364	rv = siw_qp_add(sdev, qp);
365	if (rv)
366		goto err_atomic;
367
	/* All queue indices are derived from modulo operations
	 * on free-running 'get' (consumer) and 'put' (producer)
	 * unsigned counters. Having queue sizes at a power of two
	 * avoids handling counter wrap-around.
	 */
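	/*
	 * Worked example (illustrative): with sq_size = 8 and a free-running
	 * sq_put of 13, the next SQE slot is 13 % 8 = 5. Because the counters
	 * are unsigned and the sizes are powers of two, the index stays
	 * correct across counter wrap-around at 2^32.
	 */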
374	num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
375	num_rqe = attrs->cap.max_recv_wr;
376	if (num_rqe)
377		num_rqe = roundup_pow_of_two(num_rqe);
378
379	if (udata)
380		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
381	else
382		qp->sendq = vcalloc(num_sqe, sizeof(struct siw_sqe));
383
384	if (qp->sendq == NULL) {
385		rv = -ENOMEM;
386		goto err_out_xa;
387	}
388	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
389		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
390			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
391		else {
392			rv = -EINVAL;
393			goto err_out_xa;
394		}
395	}
396	qp->pd = pd;
397	qp->scq = to_siw_cq(attrs->send_cq);
398	qp->rcq = to_siw_cq(attrs->recv_cq);
399
400	if (attrs->srq) {
401		/*
402		 * SRQ support.
403		 * Verbs 6.3.7: ignore RQ size, if SRQ present
404		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
405		 */
406		qp->srq = to_siw_srq(attrs->srq);
407		qp->attrs.rq_size = 0;
408		siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
409			qp->base_qp.qp_num);
410	} else if (num_rqe) {
411		if (udata)
412			qp->recvq =
413				vmalloc_user(num_rqe * sizeof(struct siw_rqe));
414		else
415			qp->recvq = vcalloc(num_rqe, sizeof(struct siw_rqe));
416
417		if (qp->recvq == NULL) {
418			rv = -ENOMEM;
419			goto err_out_xa;
420		}
421		qp->attrs.rq_size = num_rqe;
422	}
423	qp->attrs.sq_size = num_sqe;
424	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
425	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
426
427	/* Make those two tunables fixed for now. */
428	qp->tx_ctx.gso_seg_limit = 1;
429	qp->tx_ctx.zcopy_tx = zcopy_tx;
430
431	qp->attrs.state = SIW_QP_STATE_IDLE;
432
433	if (udata) {
434		struct siw_uresp_create_qp uresp = {};
435
436		uresp.num_sqe = num_sqe;
437		uresp.num_rqe = num_rqe;
438		uresp.qp_id = qp_id(qp);
439
440		if (qp->sendq) {
441			length = num_sqe * sizeof(struct siw_sqe);
442			qp->sq_entry =
443				siw_mmap_entry_insert(uctx, qp->sendq,
444						      length, &uresp.sq_key);
445			if (!qp->sq_entry) {
446				rv = -ENOMEM;
447				goto err_out_xa;
448			}
449		}
450
451		if (qp->recvq) {
452			length = num_rqe * sizeof(struct siw_rqe);
453			qp->rq_entry =
454				siw_mmap_entry_insert(uctx, qp->recvq,
455						      length, &uresp.rq_key);
456			if (!qp->rq_entry) {
457				uresp.sq_key = SIW_INVAL_UOBJ_KEY;
458				rv = -ENOMEM;
459				goto err_out_xa;
460			}
461		}
462
463		if (udata->outlen < sizeof(uresp)) {
464			rv = -EINVAL;
465			goto err_out_xa;
466		}
467		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
468		if (rv)
469			goto err_out_xa;
470	}
471	qp->tx_cpu = siw_get_tx_cpu(sdev);
472	if (qp->tx_cpu < 0) {
473		rv = -EINVAL;
474		goto err_out_xa;
475	}
476	INIT_LIST_HEAD(&qp->devq);
477	spin_lock_irqsave(&sdev->lock, flags);
478	list_add_tail(&qp->devq, &sdev->qp_list);
479	spin_unlock_irqrestore(&sdev->lock, flags);
480
481	init_completion(&qp->qp_free);
482
483	return 0;
484
485err_out_xa:
486	xa_erase(&sdev->qp_xa, qp_id(qp));
487	if (uctx) {
488		rdma_user_mmap_entry_remove(qp->sq_entry);
489		rdma_user_mmap_entry_remove(qp->rq_entry);
490	}
491	vfree(qp->sendq);
492	vfree(qp->recvq);
493
494err_atomic:
495	atomic_dec(&sdev->num_qp);
496	return rv;
497}
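
/*
 * Illustrative sketch, not part of this driver: a kernel ULP creating an
 * RC QP that satisfies the checks above (RC only, non-zero SQ, WR and SGE
 * limits). 'pd', 'scq' and 'rcq' are assumptions for illustration.
 *
 *	struct ib_qp_init_attr qpi = {
 *		.qp_type	 = IB_QPT_RC,
 *		.sq_sig_type	 = IB_SIGNAL_REQ_WR,
 *		.send_cq	 = scq,
 *		.recv_cq	 = rcq,
 *		.cap = {
 *			.max_send_wr	 = 64,
 *			.max_recv_wr	 = 64,
 *			.max_send_sge	 = 2,
 *			.max_recv_sge	 = 2,
 *			.max_inline_data = 0,
 *		},
 *	};
 *	struct ib_qp *qp = ib_create_qp(pd, &qpi);
 *
 *	if (IS_ERR(qp))
 *		... handle error ...
 */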
498
499/*
500 * Minimum siw_query_qp() verb interface.
501 *
502 * @qp_attr_mask is not used but all available information is provided
503 */
504int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
505		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
506{
507	struct siw_qp *qp;
508	struct siw_device *sdev;
509
510	if (base_qp && qp_attr && qp_init_attr) {
511		qp = to_siw_qp(base_qp);
512		sdev = to_siw_dev(base_qp->device);
513	} else {
514		return -EINVAL;
515	}
516	qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state];
517	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
518	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
519	qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
520	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
521	qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
522	qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
523	qp_attr->max_rd_atomic = qp->attrs.irq_size;
524	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
525
526	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
527				   IB_ACCESS_REMOTE_WRITE |
528				   IB_ACCESS_REMOTE_READ;
529
530	qp_init_attr->qp_type = base_qp->qp_type;
531	qp_init_attr->send_cq = base_qp->send_cq;
532	qp_init_attr->recv_cq = base_qp->recv_cq;
533	qp_init_attr->srq = base_qp->srq;
534
535	qp_init_attr->cap = qp_attr->cap;
536
537	return 0;
538}
539
540int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
541			int attr_mask, struct ib_udata *udata)
542{
543	struct siw_qp_attrs new_attrs;
544	enum siw_qp_attr_mask siw_attr_mask = 0;
545	struct siw_qp *qp = to_siw_qp(base_qp);
546	int rv = 0;
547
548	if (!attr_mask)
549		return 0;
550
551	if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
552		return -EOPNOTSUPP;
553
554	memset(&new_attrs, 0, sizeof(new_attrs));
555
556	if (attr_mask & IB_QP_ACCESS_FLAGS) {
557		siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
558
559		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
560			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
561		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
562			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
563		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
564			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
565	}
566	if (attr_mask & IB_QP_STATE) {
567		siw_dbg_qp(qp, "desired IB QP state: %s\n",
568			   ib_qp_state_to_string[attr->qp_state]);
569
570		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
571
572		if (new_attrs.state > SIW_QP_STATE_RTS)
573			qp->tx_ctx.tx_suspend = 1;
574
575		siw_attr_mask |= SIW_QP_ATTR_STATE;
576	}
577	if (!siw_attr_mask)
578		goto out;
579
580	down_write(&qp->state_lock);
581
582	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
583
584	up_write(&qp->state_lock);
585out:
586	return rv;
587}
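
/*
 * Illustrative sketch, not part of this driver: how a kernel ULP might
 * force a QP into ERROR state before tearing it down. Only bits within
 * IB_QP_ATTR_STANDARD_BITS are accepted above; here only IB_QP_STATE is
 * used. 'qp' is an assumption for illustration.
 *
 *	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
 *
 *	ib_modify_qp(qp, &attr, IB_QP_STATE);
 */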
588
589int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
590{
591	struct siw_qp *qp = to_siw_qp(base_qp);
592	struct siw_ucontext *uctx =
593		rdma_udata_to_drv_context(udata, struct siw_ucontext,
594					  base_ucontext);
595	struct siw_qp_attrs qp_attrs;
596
597	siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
598
	/*
	 * Mark QP as in process of destruction to prevent
	 * any async callbacks to the RDMA core.
	 */
603	qp->attrs.flags |= SIW_QP_IN_DESTROY;
604	qp->rx_stream.rx_suspend = 1;
605
606	if (uctx) {
607		rdma_user_mmap_entry_remove(qp->sq_entry);
608		rdma_user_mmap_entry_remove(qp->rq_entry);
609	}
610
611	down_write(&qp->state_lock);
612
613	qp_attrs.state = SIW_QP_STATE_ERROR;
614	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
615
616	if (qp->cep) {
617		siw_cep_put(qp->cep);
618		qp->cep = NULL;
619	}
620	up_write(&qp->state_lock);
621
622	kfree(qp->tx_ctx.mpa_crc_hd);
623	kfree(qp->rx_stream.mpa_crc_hd);
624
625	qp->scq = qp->rcq = NULL;
626
627	siw_qp_put(qp);
628	wait_for_completion(&qp->qp_free);
629
630	return 0;
631}
632
/*
 * siw_copy_inline_sgl()
 *
 * Prepare the SGL of inlined data for sending. For userland callers,
 * the function checks if the given buffer addresses and lengths are
 * within process context bounds.
 * Data from all provided SGEs are copied together into the WQE,
 * referenced by a single SGE.
 */
642static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
643			       struct siw_sqe *sqe)
644{
645	struct ib_sge *core_sge = core_wr->sg_list;
646	void *kbuf = &sqe->sge[1];
647	int num_sge = core_wr->num_sge, bytes = 0;
648
649	sqe->sge[0].laddr = (uintptr_t)kbuf;
650	sqe->sge[0].lkey = 0;
651
652	while (num_sge--) {
653		if (!core_sge->length) {
654			core_sge++;
655			continue;
656		}
657		bytes += core_sge->length;
658		if (bytes > SIW_MAX_INLINE) {
659			bytes = -EINVAL;
660			break;
661		}
662		memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr),
663		       core_sge->length);
664
665		kbuf += core_sge->length;
666		core_sge++;
667	}
668	sqe->sge[0].length = max(bytes, 0);
669	sqe->num_sge = bytes > 0 ? 1 : 0;
670
671	return bytes;
672}
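
/*
 * Illustrative sketch, not part of this driver: posting an inline SEND.
 * The gathered payload of all SGEs must not exceed SIW_MAX_INLINE, since
 * the data is copied into the SQE above. 'qp', 'pd' and 'dma_addr' are
 * assumptions for illustration.
 *
 *	struct ib_sge sge = {
 *		.addr	= dma_addr,
 *		.length	= 64,
 *		.lkey	= pd->local_dma_lkey,
 *	};
 *	struct ib_send_wr wr = {
 *		.opcode		= IB_WR_SEND,
 *		.send_flags	= IB_SEND_INLINE | IB_SEND_SIGNALED,
 *		.sg_list	= &sge,
 *		.num_sge	= 1,
 *	};
 *	const struct ib_send_wr *bad_wr;
 *
 *	ib_post_send(qp, &wr, &bad_wr);
 */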
673
674/* Complete SQ WR's without processing */
675static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
676			   const struct ib_send_wr **bad_wr)
677{
678	int rv = 0;
679
680	while (wr) {
681		struct siw_sqe sqe = {};
682
683		switch (wr->opcode) {
684		case IB_WR_RDMA_WRITE:
685			sqe.opcode = SIW_OP_WRITE;
686			break;
687		case IB_WR_RDMA_READ:
688			sqe.opcode = SIW_OP_READ;
689			break;
690		case IB_WR_RDMA_READ_WITH_INV:
691			sqe.opcode = SIW_OP_READ_LOCAL_INV;
692			break;
693		case IB_WR_SEND:
694			sqe.opcode = SIW_OP_SEND;
695			break;
696		case IB_WR_SEND_WITH_IMM:
697			sqe.opcode = SIW_OP_SEND_WITH_IMM;
698			break;
699		case IB_WR_SEND_WITH_INV:
700			sqe.opcode = SIW_OP_SEND_REMOTE_INV;
701			break;
702		case IB_WR_LOCAL_INV:
703			sqe.opcode = SIW_OP_INVAL_STAG;
704			break;
705		case IB_WR_REG_MR:
706			sqe.opcode = SIW_OP_REG_MR;
707			break;
708		default:
709			rv = -EINVAL;
710			break;
711		}
712		if (!rv) {
713			sqe.id = wr->wr_id;
714			rv = siw_sqe_complete(qp, &sqe, 0,
715					      SIW_WC_WR_FLUSH_ERR);
716		}
717		if (rv) {
718			if (bad_wr)
719				*bad_wr = wr;
720			break;
721		}
722		wr = wr->next;
723	}
724	return rv;
725}
726
727/* Complete RQ WR's without processing */
728static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
729			   const struct ib_recv_wr **bad_wr)
730{
731	struct siw_rqe rqe = {};
732	int rv = 0;
733
734	while (wr) {
735		rqe.id = wr->wr_id;
736		rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
737		if (rv) {
738			if (bad_wr)
739				*bad_wr = wr;
740			break;
741		}
742		wr = wr->next;
743	}
744	return rv;
745}
746
747/*
748 * siw_post_send()
749 *
750 * Post a list of S-WR's to a SQ.
751 *
752 * @base_qp:	Base QP contained in siw QP
753 * @wr:		Null terminated list of user WR's
754 * @bad_wr:	Points to failing WR in case of synchronous failure.
755 */
756int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
757		  const struct ib_send_wr **bad_wr)
758{
759	struct siw_qp *qp = to_siw_qp(base_qp);
760	struct siw_wqe *wqe = tx_wqe(qp);
761
762	unsigned long flags;
763	int rv = 0;
764
765	if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) {
766		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
767		*bad_wr = wr;
768		return -EINVAL;
769	}
770
	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate the needs of kernel clients.
	 */
775	if (!down_read_trylock(&qp->state_lock)) {
776		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
777			/*
778			 * ERROR state is final, so we can be sure
779			 * this state will not change as long as the QP
780			 * exists.
781			 *
782			 * This handles an ib_drain_sq() call with
783			 * a concurrent request to set the QP state
784			 * to ERROR.
785			 */
786			rv = siw_sq_flush_wr(qp, wr, bad_wr);
787		} else {
788			siw_dbg_qp(qp, "QP locked, state %d\n",
789				   qp->attrs.state);
790			*bad_wr = wr;
791			rv = -ENOTCONN;
792		}
793		return rv;
794	}
795	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
796		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
			/*
			 * Immediately flush this WR to CQ, if QP
			 * is in ERROR state. SQ is guaranteed to
			 * be empty, so WR completes in-order.
			 *
			 * Typically triggered by ib_drain_sq().
			 */
804			rv = siw_sq_flush_wr(qp, wr, bad_wr);
805		} else {
806			siw_dbg_qp(qp, "QP out of state %d\n",
807				   qp->attrs.state);
808			*bad_wr = wr;
809			rv = -ENOTCONN;
810		}
811		up_read(&qp->state_lock);
812		return rv;
813	}
814	spin_lock_irqsave(&qp->sq_lock, flags);
815
816	while (wr) {
817		u32 idx = qp->sq_put % qp->attrs.sq_size;
818		struct siw_sqe *sqe = &qp->sendq[idx];
819
820		if (sqe->flags) {
821			siw_dbg_qp(qp, "sq full\n");
822			rv = -ENOMEM;
823			break;
824		}
825		if (wr->num_sge > qp->attrs.sq_max_sges) {
826			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
827			rv = -EINVAL;
828			break;
829		}
830		sqe->id = wr->wr_id;
831
832		if ((wr->send_flags & IB_SEND_SIGNALED) ||
833		    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
834			sqe->flags |= SIW_WQE_SIGNALLED;
835
836		if (wr->send_flags & IB_SEND_FENCE)
837			sqe->flags |= SIW_WQE_READ_FENCE;
838
839		switch (wr->opcode) {
840		case IB_WR_SEND:
841		case IB_WR_SEND_WITH_INV:
842			if (wr->send_flags & IB_SEND_SOLICITED)
843				sqe->flags |= SIW_WQE_SOLICITED;
844
845			if (!(wr->send_flags & IB_SEND_INLINE)) {
846				siw_copy_sgl(wr->sg_list, sqe->sge,
847					     wr->num_sge);
848				sqe->num_sge = wr->num_sge;
849			} else {
850				rv = siw_copy_inline_sgl(wr, sqe);
851				if (rv <= 0) {
852					rv = -EINVAL;
853					break;
854				}
855				sqe->flags |= SIW_WQE_INLINE;
856				sqe->num_sge = 1;
857			}
858			if (wr->opcode == IB_WR_SEND)
859				sqe->opcode = SIW_OP_SEND;
860			else {
861				sqe->opcode = SIW_OP_SEND_REMOTE_INV;
862				sqe->rkey = wr->ex.invalidate_rkey;
863			}
864			break;
865
866		case IB_WR_RDMA_READ_WITH_INV:
867		case IB_WR_RDMA_READ:
			/*
			 * iWarp restricts the RREAD sink to an SGL containing
			 * one SGE only. We could relax this to an SGL with
			 * multiple elements referring to the SAME ltag, or
			 * even send a private per-RREQ tag referring to a
			 * checked local SGL with MULTIPLE ltags.
			 */
875			if (unlikely(wr->num_sge != 1)) {
876				rv = -EINVAL;
877				break;
878			}
879			siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
880			/*
881			 * NOTE: zero length RREAD is allowed!
882			 */
883			sqe->raddr = rdma_wr(wr)->remote_addr;
884			sqe->rkey = rdma_wr(wr)->rkey;
885			sqe->num_sge = 1;
886
887			if (wr->opcode == IB_WR_RDMA_READ)
888				sqe->opcode = SIW_OP_READ;
889			else
890				sqe->opcode = SIW_OP_READ_LOCAL_INV;
891			break;
892
893		case IB_WR_RDMA_WRITE:
894			if (!(wr->send_flags & IB_SEND_INLINE)) {
895				siw_copy_sgl(wr->sg_list, &sqe->sge[0],
896					     wr->num_sge);
897				sqe->num_sge = wr->num_sge;
898			} else {
899				rv = siw_copy_inline_sgl(wr, sqe);
900				if (unlikely(rv < 0)) {
901					rv = -EINVAL;
902					break;
903				}
904				sqe->flags |= SIW_WQE_INLINE;
905				sqe->num_sge = 1;
906			}
907			sqe->raddr = rdma_wr(wr)->remote_addr;
908			sqe->rkey = rdma_wr(wr)->rkey;
909			sqe->opcode = SIW_OP_WRITE;
910			break;
911
912		case IB_WR_REG_MR:
913			sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
914			sqe->rkey = reg_wr(wr)->key;
915			sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
916			sqe->opcode = SIW_OP_REG_MR;
917			break;
918
919		case IB_WR_LOCAL_INV:
920			sqe->rkey = wr->ex.invalidate_rkey;
921			sqe->opcode = SIW_OP_INVAL_STAG;
922			break;
923
924		default:
925			siw_dbg_qp(qp, "ib wr type %d unsupported\n",
926				   wr->opcode);
927			rv = -EINVAL;
928			break;
929		}
930		siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
931			   sqe->opcode, sqe->flags,
932			   (void *)(uintptr_t)sqe->id);
933
934		if (unlikely(rv < 0))
935			break;
936
937		/* make SQE only valid after completely written */
938		smp_wmb();
939		sqe->flags |= SIW_WQE_VALID;
940
941		qp->sq_put++;
942		wr = wr->next;
943	}
944
	/*
	 * Send directly if SQ processing is not in progress.
	 * Possible immediate errors (rv < 0) do not affect the involved
	 * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
	 * processing, if new work is already pending. But rv must be
	 * passed back to the caller.
	 */
952	if (wqe->wr_status != SIW_WR_IDLE) {
953		spin_unlock_irqrestore(&qp->sq_lock, flags);
954		goto skip_direct_sending;
955	}
956	rv = siw_activate_tx(qp);
957	spin_unlock_irqrestore(&qp->sq_lock, flags);
958
959	if (rv <= 0)
960		goto skip_direct_sending;
961
962	if (rdma_is_kernel_res(&qp->base_qp.res)) {
963		rv = siw_sq_start(qp);
964	} else {
965		qp->tx_ctx.in_syscall = 1;
966
967		if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
968			siw_qp_cm_drop(qp, 0);
969
970		qp->tx_ctx.in_syscall = 0;
971	}
972skip_direct_sending:
973
974	up_read(&qp->state_lock);
975
976	if (rv >= 0)
977		return 0;
978	/*
979	 * Immediate error
980	 */
981	siw_dbg_qp(qp, "error %d\n", rv);
982
983	*bad_wr = wr;
984	return rv;
985}
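
/*
 * Illustrative sketch, not part of this driver: a kernel ULP posting a
 * signaled RDMA WRITE through the regular verbs entry point, which ends
 * up in siw_post_send() above. 'qp', 'dma_addr', 'lkey', 'raddr', 'rkey'
 * and 'my_ctx' are assumptions for illustration.
 *
 *	struct ib_sge sge = {
 *		.addr	= dma_addr,
 *		.length	= 4096,
 *		.lkey	= lkey,
 *	};
 *	struct ib_rdma_wr wr = {
 *		.wr = {
 *			.opcode		= IB_WR_RDMA_WRITE,
 *			.send_flags	= IB_SEND_SIGNALED,
 *			.sg_list	= &sge,
 *			.num_sge	= 1,
 *			.wr_id		= (uintptr_t)my_ctx,
 *		},
 *		.remote_addr	= raddr,
 *		.rkey		= rkey,
 *	};
 *	const struct ib_send_wr *bad_wr;
 *
 *	int ret = ib_post_send(qp, &wr.wr, &bad_wr);
 */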
986
987/*
988 * siw_post_receive()
989 *
990 * Post a list of R-WR's to a RQ.
991 *
992 * @base_qp:	Base QP contained in siw QP
993 * @wr:		Null terminated list of user WR's
994 * @bad_wr:	Points to failing WR in case of synchronous failure.
995 */
996int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
997		     const struct ib_recv_wr **bad_wr)
998{
999	struct siw_qp *qp = to_siw_qp(base_qp);
1000	unsigned long flags;
1001	int rv = 0;
1002
1003	if (qp->srq || qp->attrs.rq_size == 0) {
1004		*bad_wr = wr;
1005		return -EINVAL;
1006	}
1007	if (!rdma_is_kernel_res(&qp->base_qp.res)) {
1008		siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
1009		*bad_wr = wr;
1010		return -EINVAL;
1011	}
1012
	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate the needs of kernel clients.
	 */
1017	if (!down_read_trylock(&qp->state_lock)) {
1018		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1019			/*
1020			 * ERROR state is final, so we can be sure
1021			 * this state will not change as long as the QP
1022			 * exists.
1023			 *
1024			 * This handles an ib_drain_rq() call with
1025			 * a concurrent request to set the QP state
1026			 * to ERROR.
1027			 */
1028			rv = siw_rq_flush_wr(qp, wr, bad_wr);
1029		} else {
1030			siw_dbg_qp(qp, "QP locked, state %d\n",
1031				   qp->attrs.state);
1032			*bad_wr = wr;
1033			rv = -ENOTCONN;
1034		}
1035		return rv;
1036	}
1037	if (qp->attrs.state > SIW_QP_STATE_RTS) {
1038		if (qp->attrs.state == SIW_QP_STATE_ERROR) {
			/*
			 * Immediately flush this WR to CQ, if QP
			 * is in ERROR state. RQ is guaranteed to
			 * be empty, so WR completes in-order.
			 *
			 * Typically triggered by ib_drain_rq().
			 */
1046			rv = siw_rq_flush_wr(qp, wr, bad_wr);
1047		} else {
1048			siw_dbg_qp(qp, "QP out of state %d\n",
1049				   qp->attrs.state);
1050			*bad_wr = wr;
1051			rv = -ENOTCONN;
1052		}
1053		up_read(&qp->state_lock);
1054		return rv;
1055	}
1056	/*
1057	 * Serialize potentially multiple producers.
1058	 * Not needed for single threaded consumer side.
1059	 */
1060	spin_lock_irqsave(&qp->rq_lock, flags);
1061
1062	while (wr) {
1063		u32 idx = qp->rq_put % qp->attrs.rq_size;
1064		struct siw_rqe *rqe = &qp->recvq[idx];
1065
1066		if (rqe->flags) {
1067			siw_dbg_qp(qp, "RQ full\n");
1068			rv = -ENOMEM;
1069			break;
1070		}
1071		if (wr->num_sge > qp->attrs.rq_max_sges) {
1072			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
1073			rv = -EINVAL;
1074			break;
1075		}
1076		rqe->id = wr->wr_id;
1077		rqe->num_sge = wr->num_sge;
1078		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1079
1080		/* make sure RQE is completely written before valid */
1081		smp_wmb();
1082
1083		rqe->flags = SIW_WQE_VALID;
1084
1085		qp->rq_put++;
1086		wr = wr->next;
1087	}
1088	spin_unlock_irqrestore(&qp->rq_lock, flags);
1089
1090	up_read(&qp->state_lock);
1091
1092	if (rv < 0) {
1093		siw_dbg_qp(qp, "error %d\n", rv);
1094		*bad_wr = wr;
1095	}
1096	return rv > 0 ? 0 : rv;
1097}
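
/*
 * Illustrative sketch, not part of this driver: a kernel ULP posting a
 * receive buffer. For user mapped RQs, the receive queue is written
 * directly from userspace instead (see the check above). 'qp',
 * 'dma_addr', 'lkey' and 'my_ctx' are assumptions for illustration.
 *
 *	struct ib_sge sge = {
 *		.addr	= dma_addr,
 *		.length	= 4096,
 *		.lkey	= lkey,
 *	};
 *	struct ib_recv_wr rwr = {
 *		.wr_id	 = (uintptr_t)my_ctx,
 *		.sg_list = &sge,
 *		.num_sge = 1,
 *	};
 *	const struct ib_recv_wr *bad_wr;
 *
 *	int ret = ib_post_recv(qp, &rwr, &bad_wr);
 */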
1098
1099int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
1100{
1101	struct siw_cq *cq = to_siw_cq(base_cq);
1102	struct siw_device *sdev = to_siw_dev(base_cq->device);
1103	struct siw_ucontext *ctx =
1104		rdma_udata_to_drv_context(udata, struct siw_ucontext,
1105					  base_ucontext);
1106
1107	siw_dbg_cq(cq, "free CQ resources\n");
1108
1109	siw_cq_flush(cq);
1110
1111	if (ctx)
1112		rdma_user_mmap_entry_remove(cq->cq_entry);
1113
1114	atomic_dec(&sdev->num_cq);
1115
1116	vfree(cq->queue);
1117	return 0;
1118}
1119
1120/*
1121 * siw_create_cq()
1122 *
1123 * Populate CQ of requested size
1124 *
1125 * @base_cq: CQ as allocated by RDMA midlayer
1126 * @attr: Initial CQ attributes
1127 * @udata: relates to user context
1128 */
1129
1130int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1131		  struct ib_udata *udata)
1132{
1133	struct siw_device *sdev = to_siw_dev(base_cq->device);
1134	struct siw_cq *cq = to_siw_cq(base_cq);
1135	int rv, size = attr->cqe;
1136
1137	if (attr->flags)
1138		return -EOPNOTSUPP;
1139
1140	if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1141		siw_dbg(base_cq->device, "too many CQ's\n");
1142		rv = -ENOMEM;
1143		goto err_out;
1144	}
1145	if (size < 1 || size > sdev->attrs.max_cqe) {
1146		siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1147		rv = -EINVAL;
1148		goto err_out;
1149	}
1150	size = roundup_pow_of_two(size);
1151	cq->base_cq.cqe = size;
1152	cq->num_cqe = size;
1153
1154	if (udata)
1155		cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1156					 sizeof(struct siw_cq_ctrl));
1157	else
1158		cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1159				    sizeof(struct siw_cq_ctrl));
1160
1161	if (cq->queue == NULL) {
1162		rv = -ENOMEM;
1163		goto err_out;
1164	}
1165	get_random_bytes(&cq->id, 4);
1166	siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1167
1168	spin_lock_init(&cq->lock);
1169
1170	cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1171
1172	if (udata) {
1173		struct siw_uresp_create_cq uresp = {};
1174		struct siw_ucontext *ctx =
1175			rdma_udata_to_drv_context(udata, struct siw_ucontext,
1176						  base_ucontext);
1177		size_t length = size * sizeof(struct siw_cqe) +
1178			sizeof(struct siw_cq_ctrl);
1179
1180		cq->cq_entry =
1181			siw_mmap_entry_insert(ctx, cq->queue,
1182					      length, &uresp.cq_key);
1183		if (!cq->cq_entry) {
1184			rv = -ENOMEM;
1185			goto err_out;
1186		}
1187
1188		uresp.cq_id = cq->id;
1189		uresp.num_cqe = size;
1190
1191		if (udata->outlen < sizeof(uresp)) {
1192			rv = -EINVAL;
1193			goto err_out;
1194		}
1195		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1196		if (rv)
1197			goto err_out;
1198	}
1199	return 0;
1200
1201err_out:
1202	siw_dbg(base_cq->device, "CQ creation failed: %d", rv);
1203
1204	if (cq->queue) {
1205		struct siw_ucontext *ctx =
1206			rdma_udata_to_drv_context(udata, struct siw_ucontext,
1207						  base_ucontext);
1208		if (ctx)
1209			rdma_user_mmap_entry_remove(cq->cq_entry);
1210		vfree(cq->queue);
1211	}
1212	atomic_dec(&sdev->num_cq);
1213
1214	return rv;
1215}
1216
1217/*
1218 * siw_poll_cq()
1219 *
1220 * Reap CQ entries if available and copy work completion status into
1221 * array of WC's provided by caller. Returns number of reaped CQE's.
1222 *
1223 * @base_cq:	Base CQ contained in siw CQ.
1224 * @num_cqe:	Maximum number of CQE's to reap.
1225 * @wc:		Array of work completions to be filled by siw.
1226 */
1227int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1228{
1229	struct siw_cq *cq = to_siw_cq(base_cq);
1230	int i;
1231
1232	for (i = 0; i < num_cqe; i++) {
1233		if (!siw_reap_cqe(cq, wc))
1234			break;
1235		wc++;
1236	}
1237	return i;
1238}
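
/*
 * Illustrative sketch, not part of this driver: reaping completions.
 * 'cq' is an assumption; siw_poll_cq() returns the number of WCs
 * actually filled in, which may be less than requested.
 *
 *	struct ib_wc wc[4];
 *	int i, n = ib_poll_cq(cq, ARRAY_SIZE(wc), wc);
 *
 *	for (i = 0; i < n; i++) {
 *		if (wc[i].status != IB_WC_SUCCESS)
 *			... handle flush/error completion ...
 *	}
 */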
1239
1240/*
1241 * siw_req_notify_cq()
1242 *
1243 * Request notification for new CQE's added to that CQ.
1244 * Defined flags:
1245 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1246 *   event if a WQE with notification flag set enters the CQ
1247 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1248 *   event if a WQE enters the CQ.
 * o IB_CQ_REPORT_MISSED_EVENTS: the return value provides the
 *   number of not yet reaped CQEs regardless of their notification
 *   type and the current or new CQ notification settings.
1252 *
1253 * @base_cq:	Base CQ contained in siw CQ.
1254 * @flags:	Requested notification flags.
1255 */
1256int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1257{
1258	struct siw_cq *cq = to_siw_cq(base_cq);
1259
1260	siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1261
1262	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1263		/*
1264		 * Enable CQ event for next solicited completion.
1265		 * and make it visible to all associated producers.
1266		 */
1267		smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1268	else
1269		/*
1270		 * Enable CQ event for any signalled completion.
1271		 * and make it visible to all associated producers.
1272		 */
1273		smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1274
1275	if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1276		return cq->cq_put - cq->cq_get;
1277
1278	return 0;
1279}
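
/*
 * Illustrative sketch, not part of this driver: the usual arm-then-repoll
 * pattern that avoids missing completions which race with re-arming. It
 * relies on IB_CQ_REPORT_MISSED_EVENTS returning a positive value while
 * unreaped CQEs are still queued (see above). 'cq' is an assumption.
 *
 *	struct ib_wc wc;
 *
 *	do {
 *		while (ib_poll_cq(cq, 1, &wc) > 0)
 *			... process wc ...
 *	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
 *				  IB_CQ_REPORT_MISSED_EVENTS) > 0);
 */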
1280
1281/*
1282 * siw_dereg_mr()
1283 *
1284 * Release Memory Region.
1285 *
1286 * @base_mr: Base MR contained in siw MR.
1287 * @udata: points to user context, unused.
1288 */
1289int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1290{
1291	struct siw_mr *mr = to_siw_mr(base_mr);
1292	struct siw_device *sdev = to_siw_dev(base_mr->device);
1293
1294	siw_dbg_mem(mr->mem, "deregister MR\n");
1295
1296	atomic_dec(&sdev->num_mr);
1297
1298	siw_mr_drop_mem(mr);
1299	kfree_rcu(mr, rcu);
1300
1301	return 0;
1302}
1303
1304/*
1305 * siw_reg_user_mr()
1306 *
1307 * Register Memory Region.
1308 *
1309 * @pd:		Protection Domain
1310 * @start:	starting address of MR (virtual address)
1311 * @len:	len of MR
1312 * @rnic_va:	not used by siw
1313 * @rights:	MR access rights
1314 * @udata:	user buffer to communicate STag and Key.
1315 */
1316struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1317			      u64 rnic_va, int rights, struct ib_udata *udata)
1318{
1319	struct siw_mr *mr = NULL;
1320	struct siw_umem *umem = NULL;
1321	struct siw_ureq_reg_mr ureq;
1322	struct siw_device *sdev = to_siw_dev(pd->device);
1323	int rv;
1324
1325	siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1326		   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1327		   (unsigned long long)len);
1328
1329	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1330		siw_dbg_pd(pd, "too many mr's\n");
1331		rv = -ENOMEM;
1332		goto err_out;
1333	}
1334	if (!len) {
1335		rv = -EINVAL;
1336		goto err_out;
1337	}
1338	umem = siw_umem_get(pd->device, start, len, rights);
1339	if (IS_ERR(umem)) {
1340		rv = PTR_ERR(umem);
1341		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1342		umem = NULL;
1343		goto err_out;
1344	}
1345	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1346	if (!mr) {
1347		rv = -ENOMEM;
1348		goto err_out;
1349	}
1350	rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1351	if (rv)
1352		goto err_out;
1353
1354	if (udata) {
1355		struct siw_uresp_reg_mr uresp = {};
1356		struct siw_mem *mem = mr->mem;
1357
1358		if (udata->inlen < sizeof(ureq)) {
1359			rv = -EINVAL;
1360			goto err_out;
1361		}
1362		rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1363		if (rv)
1364			goto err_out;
1365
1366		mr->base_mr.lkey |= ureq.stag_key;
1367		mr->base_mr.rkey |= ureq.stag_key;
1368		mem->stag |= ureq.stag_key;
1369		uresp.stag = mem->stag;
1370
1371		if (udata->outlen < sizeof(uresp)) {
1372			rv = -EINVAL;
1373			goto err_out;
1374		}
1375		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1376		if (rv)
1377			goto err_out;
1378	}
1379	mr->mem->stag_valid = 1;
1380
1381	return &mr->base_mr;
1382
1383err_out:
1384	atomic_dec(&sdev->num_mr);
1385	if (mr) {
1386		if (mr->mem)
1387			siw_mr_drop_mem(mr);
1388		kfree_rcu(mr, rcu);
1389	} else {
1390		if (umem)
1391			siw_umem_release(umem);
1392	}
1393	return ERR_PTR(rv);
1394}
1395
1396struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1397			   u32 max_sge)
1398{
1399	struct siw_device *sdev = to_siw_dev(pd->device);
1400	struct siw_mr *mr = NULL;
1401	struct siw_pbl *pbl = NULL;
1402	int rv;
1403
1404	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1405		siw_dbg_pd(pd, "too many mr's\n");
1406		rv = -ENOMEM;
1407		goto err_out;
1408	}
1409	if (mr_type != IB_MR_TYPE_MEM_REG) {
1410		siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1411		rv = -EOPNOTSUPP;
1412		goto err_out;
1413	}
1414	if (max_sge > SIW_MAX_SGE_PBL) {
1415		siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1416		rv = -ENOMEM;
1417		goto err_out;
1418	}
1419	pbl = siw_pbl_alloc(max_sge);
1420	if (IS_ERR(pbl)) {
1421		rv = PTR_ERR(pbl);
1422		siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1423		pbl = NULL;
1424		goto err_out;
1425	}
1426	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1427	if (!mr) {
1428		rv = -ENOMEM;
1429		goto err_out;
1430	}
1431	rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1432	if (rv)
1433		goto err_out;
1434
1435	mr->mem->is_pbl = 1;
1436
1437	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1438
1439	return &mr->base_mr;
1440
1441err_out:
1442	atomic_dec(&sdev->num_mr);
1443
1444	if (!mr) {
1445		kfree(pbl);
1446	} else {
1447		if (mr->mem)
1448			siw_mr_drop_mem(mr);
1449		kfree_rcu(mr, rcu);
1450	}
1451	siw_dbg_pd(pd, "failed: %d\n", rv);
1452
1453	return ERR_PTR(rv);
1454}
1455
1456/* Just used to count number of pages being mapped */
1457static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1458{
1459	return 0;
1460}
1461
1462int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1463		  unsigned int *sg_off)
1464{
1465	struct scatterlist *slp;
1466	struct siw_mr *mr = to_siw_mr(base_mr);
1467	struct siw_mem *mem = mr->mem;
1468	struct siw_pbl *pbl = mem->pbl;
1469	struct siw_pble *pble;
1470	unsigned long pbl_size;
1471	int i, rv;
1472
1473	if (!pbl) {
1474		siw_dbg_mem(mem, "no PBL allocated\n");
1475		return -EINVAL;
1476	}
1477	pble = pbl->pbe;
1478
1479	if (pbl->max_buf < num_sle) {
1480		siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
1481			    num_sle, pbl->max_buf);
1482		return -ENOMEM;
1483	}
1484	for_each_sg(sl, slp, num_sle, i) {
1485		if (sg_dma_len(slp) == 0) {
1486			siw_dbg_mem(mem, "empty SGE\n");
1487			return -EINVAL;
1488		}
1489		if (i == 0) {
1490			pble->addr = sg_dma_address(slp);
1491			pble->size = sg_dma_len(slp);
1492			pble->pbl_off = 0;
1493			pbl_size = pble->size;
1494			pbl->num_buf = 1;
1495		} else {
1496			/* Merge PBL entries if adjacent */
1497			if (pble->addr + pble->size == sg_dma_address(slp)) {
1498				pble->size += sg_dma_len(slp);
1499			} else {
1500				pble++;
1501				pbl->num_buf++;
1502				pble->addr = sg_dma_address(slp);
1503				pble->size = sg_dma_len(slp);
1504				pble->pbl_off = pbl_size;
1505			}
1506			pbl_size += sg_dma_len(slp);
1507		}
1508		siw_dbg_mem(mem,
1509			"sge[%d], size %u, addr 0x%p, total %lu\n",
1510			i, pble->size, ib_virt_dma_to_ptr(pble->addr),
1511			pbl_size);
1512	}
1513	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1514	if (rv > 0) {
1515		mem->len = base_mr->length;
1516		mem->va = base_mr->iova;
1517		siw_dbg_mem(mem,
1518			"%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1519			mem->len, (void *)(uintptr_t)mem->va, num_sle,
1520			pbl->num_buf);
1521	}
1522	return rv;
1523}
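
/*
 * Illustrative sketch, not part of this driver: the fast-registration
 * flow a kernel ULP drives against this interface. 'pd', 'qp' and 'sgl'
 * (a DMA-mapped scatterlist with 'nents' entries) are assumptions for
 * illustration.
 *
 *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
 *	int n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *
 *	struct ib_reg_wr reg = {
 *		.wr.opcode	= IB_WR_REG_MR,
 *		.wr.send_flags	= IB_SEND_SIGNALED,
 *		.mr		= mr,
 *		.key		= mr->rkey,
 *		.access		= IB_ACCESS_LOCAL_WRITE |
 *				  IB_ACCESS_REMOTE_WRITE,
 *	};
 *	const struct ib_send_wr *bad_wr;
 *
 *	if (n == nents)
 *		ib_post_send(qp, &reg.wr, &bad_wr);
 */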
1524
/*
 * siw_get_dma_mr()
 *
 * Create an (empty) DMA memory region, where no umem is attached.
 */
1530struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1531{
1532	struct siw_device *sdev = to_siw_dev(pd->device);
1533	struct siw_mr *mr = NULL;
1534	int rv;
1535
1536	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1537		siw_dbg_pd(pd, "too many mr's\n");
1538		rv = -ENOMEM;
1539		goto err_out;
1540	}
1541	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1542	if (!mr) {
1543		rv = -ENOMEM;
1544		goto err_out;
1545	}
1546	rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1547	if (rv)
1548		goto err_out;
1549
1550	mr->mem->stag_valid = 1;
1551
1552	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1553
1554	return &mr->base_mr;
1555
1556err_out:
1557	if (rv)
1558		kfree(mr);
1559
1560	atomic_dec(&sdev->num_mr);
1561
1562	return ERR_PTR(rv);
1563}
1564
1565/*
1566 * siw_create_srq()
1567 *
1568 * Create Shared Receive Queue of attributes @init_attrs
1569 * within protection domain given by @pd.
1570 *
1571 * @base_srq:	Base SRQ contained in siw SRQ.
1572 * @init_attrs:	SRQ init attributes.
1573 * @udata:	points to user context
1574 */
1575int siw_create_srq(struct ib_srq *base_srq,
1576		   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1577{
1578	struct siw_srq *srq = to_siw_srq(base_srq);
1579	struct ib_srq_attr *attrs = &init_attrs->attr;
1580	struct siw_device *sdev = to_siw_dev(base_srq->device);
1581	struct siw_ucontext *ctx =
1582		rdma_udata_to_drv_context(udata, struct siw_ucontext,
1583					  base_ucontext);
1584	int rv;
1585
1586	if (init_attrs->srq_type != IB_SRQT_BASIC)
1587		return -EOPNOTSUPP;
1588
1589	if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1590		siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
1591		rv = -ENOMEM;
1592		goto err_out;
1593	}
1594	if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1595	    attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1596		rv = -EINVAL;
1597		goto err_out;
1598	}
1599	srq->max_sge = attrs->max_sge;
1600	srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1601	srq->limit = attrs->srq_limit;
1602	if (srq->limit)
1603		srq->armed = true;
1604
1605	srq->is_kernel_res = !udata;
1606
1607	if (udata)
1608		srq->recvq =
1609			vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1610	else
1611		srq->recvq = vcalloc(srq->num_rqe, sizeof(struct siw_rqe));
1612
1613	if (srq->recvq == NULL) {
1614		rv = -ENOMEM;
1615		goto err_out;
1616	}
1617	if (udata) {
1618		struct siw_uresp_create_srq uresp = {};
1619		size_t length = srq->num_rqe * sizeof(struct siw_rqe);
1620
1621		srq->srq_entry =
1622			siw_mmap_entry_insert(ctx, srq->recvq,
1623					      length, &uresp.srq_key);
1624		if (!srq->srq_entry) {
1625			rv = -ENOMEM;
1626			goto err_out;
1627		}
1628
1629		uresp.num_rqe = srq->num_rqe;
1630
1631		if (udata->outlen < sizeof(uresp)) {
1632			rv = -EINVAL;
1633			goto err_out;
1634		}
1635		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1636		if (rv)
1637			goto err_out;
1638	}
1639	spin_lock_init(&srq->lock);
1640
1641	siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1642
1643	return 0;
1644
1645err_out:
1646	if (srq->recvq) {
1647		if (ctx)
1648			rdma_user_mmap_entry_remove(srq->srq_entry);
1649		vfree(srq->recvq);
1650	}
1651	atomic_dec(&sdev->num_srq);
1652
1653	return rv;
1654}
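
/*
 * Illustrative sketch, not part of this driver: creating a basic SRQ and
 * attaching it to a new QP via ib_qp_init_attr.srq. 'pd' is an assumption
 * for illustration.
 *
 *	struct ib_srq_init_attr srq_init = {
 *		.srq_type	= IB_SRQT_BASIC,
 *		.attr = {
 *			.max_wr		= 256,
 *			.max_sge	= 2,
 *			.srq_limit	= 16,
 *		},
 *	};
 *	struct ib_srq *srq = ib_create_srq(pd, &srq_init);
 *
 * A non-zero srq_limit arms the SRQ so that IB_EVENT_SRQ_LIMIT_REACHED is
 * reported once the number of posted receives drops below the limit.
 */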
1655
1656/*
1657 * siw_modify_srq()
1658 *
1659 * Modify SRQ. The caller may resize SRQ and/or set/reset notification
1660 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
1661 *
1662 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
1663 * parameter. siw_modify_srq() does not check the attrs->max_sge param.
1664 */
1665int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1666		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1667{
1668	struct siw_srq *srq = to_siw_srq(base_srq);
1669	unsigned long flags;
1670	int rv = 0;
1671
1672	spin_lock_irqsave(&srq->lock, flags);
1673
1674	if (attr_mask & IB_SRQ_MAX_WR) {
1675		/* resize request not yet supported */
1676		rv = -EOPNOTSUPP;
1677		goto out;
1678	}
1679	if (attr_mask & IB_SRQ_LIMIT) {
1680		if (attrs->srq_limit) {
1681			if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1682				rv = -EINVAL;
1683				goto out;
1684			}
1685			srq->armed = true;
1686		} else {
1687			srq->armed = false;
1688		}
1689		srq->limit = attrs->srq_limit;
1690	}
1691out:
1692	spin_unlock_irqrestore(&srq->lock, flags);
1693
1694	return rv;
1695}
1696
1697/*
1698 * siw_query_srq()
1699 *
1700 * Query SRQ attributes.
1701 */
1702int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1703{
1704	struct siw_srq *srq = to_siw_srq(base_srq);
1705	unsigned long flags;
1706
1707	spin_lock_irqsave(&srq->lock, flags);
1708
1709	attrs->max_wr = srq->num_rqe;
1710	attrs->max_sge = srq->max_sge;
1711	attrs->srq_limit = srq->limit;
1712
1713	spin_unlock_irqrestore(&srq->lock, flags);
1714
1715	return 0;
1716}
1717
1718/*
1719 * siw_destroy_srq()
1720 *
1721 * Destroy SRQ.
1722 * It is assumed that the SRQ is not referenced by any
1723 * QP anymore - the code trusts the RDMA core environment to keep track
1724 * of QP references.
1725 */
1726int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1727{
1728	struct siw_srq *srq = to_siw_srq(base_srq);
1729	struct siw_device *sdev = to_siw_dev(base_srq->device);
1730	struct siw_ucontext *ctx =
1731		rdma_udata_to_drv_context(udata, struct siw_ucontext,
1732					  base_ucontext);
1733
1734	if (ctx)
1735		rdma_user_mmap_entry_remove(srq->srq_entry);
1736	vfree(srq->recvq);
1737	atomic_dec(&sdev->num_srq);
1738	return 0;
1739}
1740
1741/*
1742 * siw_post_srq_recv()
1743 *
1744 * Post a list of receive queue elements to SRQ.
1745 * NOTE: The function does not check or lock a certain SRQ state
1746 *       during the post operation. The code simply trusts the
1747 *       RDMA core environment.
1748 *
1749 * @base_srq:	Base SRQ contained in siw SRQ
1750 * @wr:		List of R-WR's
1751 * @bad_wr:	Updated to failing WR if posting fails.
1752 */
1753int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1754		      const struct ib_recv_wr **bad_wr)
1755{
1756	struct siw_srq *srq = to_siw_srq(base_srq);
1757	unsigned long flags;
1758	int rv = 0;
1759
1760	if (unlikely(!srq->is_kernel_res)) {
1761		siw_dbg_pd(base_srq->pd,
1762			   "[SRQ]: no kernel post_recv for mapped srq\n");
1763		rv = -EINVAL;
1764		goto out;
1765	}
1766	/*
1767	 * Serialize potentially multiple producers.
1768	 * Also needed to serialize potentially multiple
1769	 * consumers.
1770	 */
1771	spin_lock_irqsave(&srq->lock, flags);
1772
1773	while (wr) {
1774		u32 idx = srq->rq_put % srq->num_rqe;
1775		struct siw_rqe *rqe = &srq->recvq[idx];
1776
1777		if (rqe->flags) {
1778			siw_dbg_pd(base_srq->pd, "SRQ full\n");
1779			rv = -ENOMEM;
1780			break;
1781		}
1782		if (unlikely(wr->num_sge > srq->max_sge)) {
1783			siw_dbg_pd(base_srq->pd,
1784				   "[SRQ]: too many sge's: %d\n", wr->num_sge);
1785			rv = -EINVAL;
1786			break;
1787		}
1788		rqe->id = wr->wr_id;
1789		rqe->num_sge = wr->num_sge;
1790		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1791
1792		/* Make sure S-RQE is completely written before valid */
1793		smp_wmb();
1794
1795		rqe->flags = SIW_WQE_VALID;
1796
1797		srq->rq_put++;
1798		wr = wr->next;
1799	}
1800	spin_unlock_irqrestore(&srq->lock, flags);
1801out:
1802	if (unlikely(rv < 0)) {
1803		siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1804		*bad_wr = wr;
1805	}
1806	return rv;
1807}
1808
1809void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1810{
1811	struct ib_event event;
1812	struct ib_qp *base_qp = &qp->base_qp;
1813
1814	/*
1815	 * Do not report asynchronous errors on QP which gets
1816	 * destroyed via verbs interface (siw_destroy_qp())
1817	 */
1818	if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1819		return;
1820
1821	event.event = etype;
1822	event.device = base_qp->device;
1823	event.element.qp = base_qp;
1824
1825	if (base_qp->event_handler) {
1826		siw_dbg_qp(qp, "reporting event %d\n", etype);
1827		base_qp->event_handler(&event, base_qp->qp_context);
1828	}
1829}
1830
1831void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1832{
1833	struct ib_event event;
1834	struct ib_cq *base_cq = &cq->base_cq;
1835
1836	event.event = etype;
1837	event.device = base_cq->device;
1838	event.element.cq = base_cq;
1839
1840	if (base_cq->event_handler) {
1841		siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1842		base_cq->event_handler(&event, base_cq->cq_context);
1843	}
1844}
1845
1846void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1847{
1848	struct ib_event event;
1849	struct ib_srq *base_srq = &srq->base_srq;
1850
1851	event.event = etype;
1852	event.device = base_srq->device;
1853	event.element.srq = base_srq;
1854
1855	if (base_srq->event_handler) {
1856		siw_dbg_pd(srq->base_srq.pd,
1857			   "reporting SRQ event %d\n", etype);
1858		base_srq->event_handler(&event, base_srq->srq_context);
1859	}
1860}
1861
1862void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
1863{
1864	struct ib_event event;
1865
1866	event.event = etype;
1867	event.device = &sdev->base_dev;
1868	event.element.port_num = port;
1869
1870	siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1871
1872	ib_dispatch_event(&event);
1873}
1874