1/*-
2 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26#include "icl_iser.h"
27
/* malloc(9) type used for every malloc()/free() call in this file. */
static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
/*
 * Once iser_cq_tasklet_fn() has handled at least this many completions in
 * a single run it stops polling the CQ and re-arms it.
 */
static int iser_cq_poll_limit = 512;
30
31static void
32iser_cq_event_callback(struct ib_event *cause, void *context)
33{
34	ISER_ERR("got cq event %d", cause->event);
35}
36
37static void
38iser_qp_event_callback(struct ib_event *cause, void *context)
39{
40	ISER_ERR("got qp event %d", cause->event);
41}
42
43static void
44iser_event_handler(struct ib_event_handler *handler,
45				struct ib_event *event)
46{
47	ISER_ERR("async event %d on device %s port %d",
48		 event->event, event->device->name,
49		 event->element.port_num);
50}
51
52/**
53 * is_iser_tx_desc - Indicate if the completion wr_id
54 *     is a TX descriptor or not.
55 * @iser_conn: iser connection
56 * @wr_id: completion WR identifier
57 *
58 * Since we cannot rely on wc opcode in FLUSH errors
59 * we must work around it by checking if the wr_id address
60 * falls in the iser connection rx_descs buffer. If so
61 * it is an RX descriptor, otherwize it is a TX.
62 */
63static inline bool
64is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
65{
66	void *start = iser_conn->rx_descs;
67	u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
68	void *end = (void *)((uintptr_t)start + (uintptr_t)len);
69
70	if (start) {
71		if (wr_id >= start && wr_id < end)
72			return false;
73	} else {
74		return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
75	}
76
77	return true;
78}
79
80/**
81 * iser_handle_comp_error() - Handle error completion
82 * @ib_conn:   connection RDMA resources
83 * @wc:        work completion
84 *
85 * Notes: Update post_recv_buf_count in case of recv error completion.
86 *        For non-FLUSH error completion we should also notify iscsi layer that
87 *        connection is failed (in case we passed bind stage).
88 */
89static void
90iser_handle_comp_error(struct ib_conn *ib_conn,
91		       struct ib_wc *wc)
92{
93	void *wr_id = (void *)(uintptr_t)wc->wr_id;
94	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
95						   ib_conn);
96
97	if (is_iser_tx_desc(iser_conn, wr_id)) {
98		ISER_DBG("conn %p got send comp error", iser_conn);
99	} else {
100		ISER_DBG("conn %p got recv comp error", iser_conn);
101		ib_conn->post_recv_buf_count--;
102	}
103	if (wc->status != IB_WC_WR_FLUSH_ERR)
104		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
105}
106
107/**
108 * iser_handle_wc - handle a single work completion
109 * @wc: work completion
110 *
111 * Soft-IRQ context, work completion can be either
112 * SEND or RECV, and can turn out successful or
113 * with error (or flush error).
114 */
115static void iser_handle_wc(struct ib_wc *wc)
116{
117	struct ib_conn *ib_conn;
118	struct iser_tx_desc *tx_desc;
119	struct iser_rx_desc *rx_desc;
120
121	ib_conn = wc->qp->qp_context;
122	if (likely(wc->status == IB_WC_SUCCESS)) {
123		if (wc->opcode == IB_WC_RECV) {
124			rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
125			iser_rcv_completion(rx_desc, wc->byte_len,
126					    ib_conn);
127		} else
128		if (wc->opcode == IB_WC_SEND) {
129			tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
130			iser_snd_completion(tx_desc, ib_conn);
131		} else {
132			ISER_ERR("Unknown wc opcode %d", wc->opcode);
133		}
134	} else {
135		struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
136					ib_conn);
137		if (wc->status != IB_WC_WR_FLUSH_ERR) {
138			ISER_ERR("conn %p wr id %llx status %d vend_err %x",
139				 iser_conn, (unsigned long long)wc->wr_id,
140				 wc->status, wc->vendor_err);
141		} else {
142			ISER_DBG("flush error: conn %p wr id %llx",
143				 iser_conn, (unsigned long long)wc->wr_id);
144		}
145
146		if (wc->wr_id == ISER_BEACON_WRID) {
147			/* all flush errors were consumed */
148			mtx_lock(&ib_conn->beacon.flush_lock);
149			ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn);
150			cv_signal(&ib_conn->beacon.flush_cv);
151			mtx_unlock(&ib_conn->beacon.flush_lock);
152		} else {
153			iser_handle_comp_error(ib_conn, wc);
154		}
155	}
156}
157
158static void
159iser_cq_tasklet_fn(void *data, int pending)
160{
161	struct iser_comp *comp = (struct iser_comp *)data;
162	struct ib_cq *cq = comp->cq;
163	struct ib_wc *const wcs = comp->wcs;
164	int completed = 0;
165	int i;
166	int n;
167
168	while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
169		for (i = 0; i < n; i++)
170			iser_handle_wc(&wcs[i]);
171
172		completed += n;
173		if (completed >= iser_cq_poll_limit)
174			break;
175	}
176
177	/*
178	 * It is assumed here that arming CQ only once its empty
179	 * would not cause interrupts to be missed.
180	 */
181	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
182}
183
184static void
185iser_cq_callback(struct ib_cq *cq, void *cq_context)
186{
187	struct iser_comp *comp = cq_context;
188
189	taskqueue_enqueue(comp->tq, &comp->task);
190}
191
192/**
193 * iser_create_device_ib_res - creates Protection Domain (PD), Completion
194 * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
195 * the adapator.
196 *
197 * returns 0 on success, -1 on failure
198 */
199static int
200iser_create_device_ib_res(struct iser_device *device)
201{
202	struct ib_device *ib_dev = device->ib_device;
203	int i, max_cqe;
204
205	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
206		ISER_ERR("device %s doesn't support Fastreg, "
207			 "can't register memory", device->ib_device->name);
208		return (1);
209	}
210
211	device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors);
212
213	device->comps = malloc(device->comps_used * sizeof(*device->comps),
214		M_ISER_VERBS, M_WAITOK | M_ZERO);
215	if (!device->comps)
216		goto comps_err;
217
218	max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);
219
220	ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d",
221		 device->comps_used, device->ib_device->name,
222		 device->ib_device->num_comp_vectors, max_cqe);
223
224	device->pd = ib_alloc_pd(device->ib_device, IB_PD_UNSAFE_GLOBAL_RKEY);
225	if (IS_ERR(device->pd))
226		goto pd_err;
227
228	for (i = 0; i < device->comps_used; i++) {
229		struct iser_comp *comp = &device->comps[i];
230		struct ib_cq_init_attr cq_attr = {
231			.cqe		= max_cqe,
232			.comp_vector	= i,
233		};
234
235		comp->device = device;
236		comp->cq = ib_create_cq(device->ib_device,
237					iser_cq_callback,
238					iser_cq_event_callback,
239					(void *)comp,
240					&cq_attr);
241		if (IS_ERR(comp->cq)) {
242			comp->cq = NULL;
243			goto cq_err;
244		}
245
246		if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
247			goto cq_err;
248
249		TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp);
250		comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT,
251				taskqueue_thread_enqueue, &comp->tq);
252		if (!comp->tq)
253			goto tq_err;
254		taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq");
255	}
256
257	device->mr = device->pd->__internal_mr;
258	if (IS_ERR(device->mr))
259		goto tq_err;
260
261	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
262				iser_event_handler);
263	if (ib_register_event_handler(&device->event_handler))
264		goto tq_err;
265
266	return (0);
267
268tq_err:
269	for (i = 0; i < device->comps_used; i++) {
270		struct iser_comp *comp = &device->comps[i];
271		if (comp->tq)
272			taskqueue_free(comp->tq);
273	}
274cq_err:
275	for (i = 0; i < device->comps_used; i++) {
276		struct iser_comp *comp = &device->comps[i];
277		if (comp->cq)
278			ib_destroy_cq(comp->cq);
279	}
280	ib_dealloc_pd(device->pd);
281pd_err:
282	free(device->comps, M_ISER_VERBS);
283comps_err:
284	ISER_ERR("failed to allocate an IB resource");
285	return (1);
286}
287
288/**
289 * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
290 * CQ and PD created with the device associated with the adapator.
291 */
292static void
293iser_free_device_ib_res(struct iser_device *device)
294{
295	int i;
296
297	for (i = 0; i < device->comps_used; i++) {
298		struct iser_comp *comp = &device->comps[i];
299
300		taskqueue_free(comp->tq);
301		ib_destroy_cq(comp->cq);
302		comp->cq = NULL;
303	}
304
305	(void)ib_unregister_event_handler(&device->event_handler);
306	(void)ib_dealloc_pd(device->pd);
307
308	free(device->comps, M_ISER_VERBS);
309	device->comps = NULL;
310
311	device->mr = NULL;
312	device->pd = NULL;
313}
314
315static int
316iser_alloc_reg_res(struct ib_device *ib_device,
317		   struct ib_pd *pd,
318		   struct iser_reg_resources *res)
319{
320	int ret;
321
322	res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, ISCSI_ISER_SG_TABLESIZE + 1);
323	if (IS_ERR(res->mr)) {
324		ret = -PTR_ERR(res->mr);
325		ISER_ERR("Failed to allocate  fast reg mr err=%d", ret);
326		return (ret);
327	}
328	res->mr_valid = 1;
329
330	return (0);
331}
332
333static void
334iser_free_reg_res(struct iser_reg_resources *rsc)
335{
336	ib_dereg_mr(rsc->mr);
337}
338
339static struct fast_reg_descriptor *
340iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd)
341{
342	struct fast_reg_descriptor *desc;
343	int ret;
344
345	desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO);
346	if (!desc) {
347		ISER_ERR("Failed to allocate a new fastreg descriptor");
348		return (NULL);
349	}
350
351	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
352	if (ret) {
353		ISER_ERR("failed to allocate reg_resources");
354		goto err;
355	}
356
357	return (desc);
358err:
359	free(desc, M_ISER_VERBS);
360	return (NULL);
361}
362
363/**
364 * iser_create_fmr_pool - Creates FMR pool and page_vector
365 *
366 * returns 0 on success, or errno code on failure
367 */
368int
369iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
370{
371	struct iser_device *device = ib_conn->device;
372	struct fast_reg_descriptor *desc;
373	int i;
374
375	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
376	ib_conn->fastreg.pool_size = 0;
377	for (i = 0; i < cmds_max; i++) {
378		desc = iser_create_fastreg_desc(device->ib_device, device->pd);
379		if (!desc) {
380			ISER_ERR("Failed to create fastreg descriptor");
381			goto err;
382		}
383
384		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
385		ib_conn->fastreg.pool_size++;
386	}
387
388	return (0);
389
390err:
391	iser_free_fastreg_pool(ib_conn);
392	return (ENOMEM);
393}
394
395/**
396 * iser_free_fmr_pool - releases the FMR pool and page vec
397 */
398void
399iser_free_fastreg_pool(struct ib_conn *ib_conn)
400{
401	struct fast_reg_descriptor *desc, *tmp;
402	int i = 0;
403
404	if (list_empty(&ib_conn->fastreg.pool))
405		return;
406
407	ISER_DBG("freeing conn %p fr pool", ib_conn);
408
409	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
410		list_del(&desc->list);
411		iser_free_reg_res(&desc->rsc);
412		free(desc, M_ISER_VERBS);
413		++i;
414	}
415
416	if (i < ib_conn->fastreg.pool_size)
417		ISER_WARN("pool still has %d regions registered",
418			  ib_conn->fastreg.pool_size - i);
419}
420
421/**
422 * iser_create_ib_conn_res - Queue-Pair (QP)
423 *
424 * returns 0 on success, 1 on failure
425 */
426static int
427iser_create_ib_conn_res(struct ib_conn *ib_conn)
428{
429	struct iser_conn *iser_conn;
430	struct iser_device *device;
431	struct ib_device_attr *dev_attr;
432	struct ib_qp_init_attr init_attr;
433	int index, min_index = 0;
434	int ret = -ENOMEM;
435
436	iser_conn = container_of(ib_conn, struct iser_conn, ib_conn);
437	device = ib_conn->device;
438	dev_attr = &device->dev_attr;
439
440	mtx_lock(&ig.connlist_mutex);
441	/* select the CQ with the minimal number of usages */
442	for (index = 0; index < device->comps_used; index++) {
443		if (device->comps[index].active_qps <
444		    device->comps[min_index].active_qps)
445			min_index = index;
446	}
447	ib_conn->comp = &device->comps[min_index];
448	ib_conn->comp->active_qps++;
449	mtx_unlock(&ig.connlist_mutex);
450	ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn);
451
452	memset(&init_attr, 0, sizeof init_attr);
453	init_attr.event_handler = iser_qp_event_callback;
454	init_attr.qp_context	= (void *)ib_conn;
455	init_attr.send_cq	= ib_conn->comp->cq;
456	init_attr.recv_cq	= ib_conn->comp->cq;
457	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
458	init_attr.cap.max_send_sge = 2;
459	init_attr.cap.max_recv_sge = 1;
460	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
461	init_attr.qp_type	= IB_QPT_RC;
462
463	if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
464		init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
465		iser_conn->max_cmds =
466			ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
467	} else {
468		init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
469		iser_conn->max_cmds =
470			ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
471	}
472	ISER_DBG("device %s supports max_send_wr %d",
473	         device->ib_device->name, dev_attr->max_qp_wr);
474
475	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
476	if (ret)
477		goto out_err;
478
479	ib_conn->qp = ib_conn->cma_id->qp;
480	ISER_DBG("setting conn %p cma_id %p qp %p",
481		 ib_conn, ib_conn->cma_id,
482		 ib_conn->cma_id->qp);
483
484	return (ret);
485
486out_err:
487	mtx_lock(&ig.connlist_mutex);
488	ib_conn->comp->active_qps--;
489	mtx_unlock(&ig.connlist_mutex);
490	ISER_ERR("unable to alloc mem or create resource, err %d", ret);
491
492	return (ret);
493}
494
495/**
496 * based on the resolved device node GUID see if there already allocated
497 * device for this device. If there's no such, create one.
498 */
499static struct iser_device *
500iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
501{
502	struct iser_device *device;
503
504	sx_xlock(&ig.device_list_mutex);
505
506	list_for_each_entry(device, &ig.device_list, ig_list)
507		/* find if there's a match using the node GUID */
508		if (device->ib_device->node_guid == cma_id->device->node_guid)
509			goto inc_refcnt;
510
511	device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO);
512	if (device == NULL)
513		goto out;
514
515	/* assign this device to the device */
516	device->ib_device = cma_id->device;
517	/* init the device and link it into ig device list */
518	if (iser_create_device_ib_res(device)) {
519		free(device, M_ISER_VERBS);
520		device = NULL;
521		goto out;
522	}
523	list_add(&device->ig_list, &ig.device_list);
524
525inc_refcnt:
526	device->refcount++;
527	ISER_INFO("device %p refcount %d", device, device->refcount);
528out:
529	sx_xunlock(&ig.device_list_mutex);
530	return (device);
531}
532
533/* if there's no demand for this device, release it */
534static void
535iser_device_try_release(struct iser_device *device)
536{
537	sx_xlock(&ig.device_list_mutex);
538	device->refcount--;
539	ISER_INFO("device %p refcount %d", device, device->refcount);
540	if (!device->refcount) {
541		iser_free_device_ib_res(device);
542		list_del(&device->ig_list);
543		free(device, M_ISER_VERBS);
544		device = NULL;
545	}
546	sx_xunlock(&ig.device_list_mutex);
547}
548
549/**
550 * Called with state mutex held
551 **/
552static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
553				     enum iser_conn_state comp,
554				     enum iser_conn_state exch)
555{
556	int ret;
557
558	ret = (iser_conn->state == comp);
559	if (ret)
560		iser_conn->state = exch;
561
562	return ret;
563}
564
565/**
566 * iser_free_ib_conn_res - release IB related resources
567 * @iser_conn: iser connection struct
568 * @destroy: indicator if we need to try to release the
569 *     iser device and memory regoins pool (only iscsi
570 *     shutdown and DEVICE_REMOVAL will use this).
571 *
572 * This routine is called with the iser state mutex held
573 * so the cm_id removal is out of here. It is Safe to
574 * be invoked multiple times.
575 */
576void
577iser_free_ib_conn_res(struct iser_conn *iser_conn,
578				  bool destroy)
579{
580	struct ib_conn *ib_conn = &iser_conn->ib_conn;
581	struct iser_device *device = ib_conn->device;
582
583	ISER_INFO("freeing conn %p cma_id %p qp %p",
584		  iser_conn, ib_conn->cma_id, ib_conn->qp);
585
586	if (ib_conn->qp != NULL) {
587		mtx_lock(&ig.connlist_mutex);
588		ib_conn->comp->active_qps--;
589		mtx_unlock(&ig.connlist_mutex);
590		rdma_destroy_qp(ib_conn->cma_id);
591		ib_conn->qp = NULL;
592	}
593
594	if (destroy) {
595		if (iser_conn->login_buf)
596			iser_free_login_buf(iser_conn);
597
598		if (iser_conn->rx_descs)
599			iser_free_rx_descriptors(iser_conn);
600
601		if (device != NULL) {
602			iser_device_try_release(device);
603			ib_conn->device = NULL;
604		}
605	}
606}
607
608/**
609 * triggers start of the disconnect procedures and wait for them to be done
610 * Called with state mutex held
611 */
612int
613iser_conn_terminate(struct iser_conn *iser_conn)
614{
615	struct ib_conn *ib_conn = &iser_conn->ib_conn;
616	const struct ib_send_wr *bad_send_wr;
617	const struct ib_recv_wr *bad_recv_wr;
618	int err = 0;
619
620	/* terminate the iser conn only if the conn state is UP */
621	if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
622					   ISER_CONN_TERMINATING))
623		return (0);
624
625	ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state);
626
627	if (ib_conn->qp == NULL) {
628		/* HOW can this be??? */
629		ISER_WARN("qp wasn't created");
630		return (1);
631	}
632
633	/*
634	 * Todo: This is a temporary workaround.
635	 * We serialize the connection closure using global lock in order to
636	 * receive all posted beacons completions.
637	 * Without Serialization, in case we open many connections (QPs) on
638	 * the same CQ, we might miss beacons because of missing interrupts.
639	 */
640	sx_xlock(&ig.close_conns_mutex);
641
642	/*
643	 * In case we didn't already clean up the cma_id (peer initiated
644	 * a disconnection), we need to Cause the CMA to change the QP
645	 * state to ERROR.
646	 */
647	if (ib_conn->cma_id) {
648		err = rdma_disconnect(ib_conn->cma_id);
649		if (err)
650			ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
651				iser_conn, err);
652
653		mtx_lock(&ib_conn->beacon.flush_lock);
654		memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
655		ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
656		ib_conn->beacon.send.opcode = IB_WR_SEND;
657		/* post an indication that all send flush errors were consumed */
658		err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr);
659		if (err) {
660			ISER_ERR("conn %p failed to post send_beacon", ib_conn);
661			mtx_unlock(&ib_conn->beacon.flush_lock);
662			goto out;
663		}
664
665		ISER_DBG("before send cv_wait: %p", iser_conn);
666		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
667		ISER_DBG("after send cv_wait: %p", iser_conn);
668
669		memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
670		ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
671		/* post an indication that all recv flush errors were consumed */
672		err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr);
673		if (err) {
674			ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
675			mtx_unlock(&ib_conn->beacon.flush_lock);
676			goto out;
677		}
678
679		ISER_DBG("before recv cv_wait: %p", iser_conn);
680		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
681		mtx_unlock(&ib_conn->beacon.flush_lock);
682		ISER_DBG("after recv cv_wait: %p", iser_conn);
683	}
684out:
685	sx_xunlock(&ig.close_conns_mutex);
686	return (1);
687}
688
689/**
690 * Called with state mutex held
691 **/
692static void
693iser_connect_error(struct rdma_cm_id *cma_id)
694{
695	struct iser_conn *iser_conn;
696
697	iser_conn = cma_id->context;
698
699	ISER_ERR("conn %p", iser_conn);
700
701	iser_conn->state = ISER_CONN_TERMINATING;
702
703	cv_signal(&iser_conn->up_cv);
704}
705
706/**
707 * Called with state mutex held
708 **/
709static void
710iser_addr_handler(struct rdma_cm_id *cma_id)
711{
712	struct iser_device *device;
713	struct iser_conn   *iser_conn;
714	struct ib_conn   *ib_conn;
715	int    ret;
716
717	iser_conn = cma_id->context;
718
719	ib_conn = &iser_conn->ib_conn;
720	device = iser_device_find_by_ib_device(cma_id);
721	if (!device) {
722		ISER_ERR("conn %p device lookup/creation failed",
723			 iser_conn);
724		iser_connect_error(cma_id);
725		return;
726	}
727
728	ib_conn->device = device;
729
730	ret = rdma_resolve_route(cma_id, 1000);
731	if (ret) {
732		ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
733		iser_connect_error(cma_id);
734		return;
735	}
736}
737
738/**
739 * Called with state mutex held
740 **/
741static void
742iser_route_handler(struct rdma_cm_id *cma_id)
743{
744	struct rdma_conn_param conn_param;
745	int    ret;
746	struct iser_cm_hdr req_hdr;
747	struct iser_conn *iser_conn = cma_id->context;
748	struct ib_conn *ib_conn = &iser_conn->ib_conn;
749	struct iser_device *device = ib_conn->device;
750
751	ret = iser_create_ib_conn_res(ib_conn);
752	if (ret)
753		goto failure;
754
755	memset(&conn_param, 0, sizeof conn_param);
756	conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
757	conn_param.retry_count	       = 7;
758	conn_param.rnr_retry_count     = 6;
759	/*
760	 * Initiaotr depth should not be set, but in order to compat
761	 * with old targets, we keep this value set.
762	 */
763	conn_param.initiator_depth     = 1;
764
765	memset(&req_hdr, 0, sizeof(req_hdr));
766	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
767			ISER_SEND_W_INV_NOT_SUPPORTED);
768	conn_param.private_data		= (void *)&req_hdr;
769	conn_param.private_data_len	= sizeof(struct iser_cm_hdr);
770
771	ret = rdma_connect(cma_id, &conn_param);
772	if (ret) {
773		ISER_ERR("conn %p failure connecting: %d", iser_conn, ret);
774		goto failure;
775	}
776
777	return;
778failure:
779	iser_connect_error(cma_id);
780}
781
782/**
783 * Called with state mutex held
784 **/
785static void
786iser_connected_handler(struct rdma_cm_id *cma_id)
787{
788	struct iser_conn *iser_conn;
789	struct ib_qp_attr attr;
790	struct ib_qp_init_attr init_attr;
791
792	iser_conn = cma_id->context;
793
794	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
795
796	ISER_INFO("remote qpn:%x my qpn:%x",
797		  attr.dest_qp_num, cma_id->qp->qp_num);
798
799	iser_conn->state = ISER_CONN_UP;
800
801	cv_signal(&iser_conn->up_cv);
802}
803
804/**
805 * Called with state mutex held
806 **/
807static void
808iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy)
809{
810	struct iser_conn *iser_conn = cma_id->context;
811
812	if (iser_conn_terminate(iser_conn))
813		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
814
815}
816
817int
818iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
819{
820	struct iser_conn *iser_conn;
821	int ret = 0;
822
823	iser_conn = cma_id->context;
824	ISER_INFO("event %d status %d conn %p id %p",
825		  event->event, event->status, cma_id->context, cma_id);
826
827	sx_xlock(&iser_conn->state_mutex);
828	switch (event->event) {
829	case RDMA_CM_EVENT_ADDR_RESOLVED:
830		iser_addr_handler(cma_id);
831		break;
832	case RDMA_CM_EVENT_ROUTE_RESOLVED:
833		iser_route_handler(cma_id);
834		break;
835	case RDMA_CM_EVENT_ESTABLISHED:
836		iser_connected_handler(cma_id);
837		break;
838	case RDMA_CM_EVENT_ADDR_ERROR:
839	case RDMA_CM_EVENT_ROUTE_ERROR:
840	case RDMA_CM_EVENT_CONNECT_ERROR:
841	case RDMA_CM_EVENT_UNREACHABLE:
842	case RDMA_CM_EVENT_REJECTED:
843		iser_connect_error(cma_id);
844		break;
845	case RDMA_CM_EVENT_DISCONNECTED:
846	case RDMA_CM_EVENT_ADDR_CHANGE:
847	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
848		iser_cleanup_handler(cma_id, false);
849		break;
850	default:
851		ISER_ERR("Unexpected RDMA CM event (%d)", event->event);
852		break;
853	}
854	sx_xunlock(&iser_conn->state_mutex);
855
856	return (ret);
857}
858
859int
860iser_post_recvl(struct iser_conn *iser_conn)
861{
862	const struct ib_recv_wr *rx_wr_failed;
863	struct ib_recv_wr rx_wr;
864	struct ib_conn *ib_conn = &iser_conn->ib_conn;
865	struct ib_sge	  sge;
866	int ib_ret;
867
868	sge.addr   = iser_conn->login_resp_dma;
869	sge.length = ISER_RX_LOGIN_SIZE;
870	sge.lkey   = ib_conn->device->mr->lkey;
871
872	rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
873	rx_wr.sg_list = &sge;
874	rx_wr.num_sge = 1;
875	rx_wr.next    = NULL;
876
877	ib_conn->post_recv_buf_count++;
878	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
879	if (ib_ret) {
880		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
881		ib_conn->post_recv_buf_count--;
882	}
883
884	return (ib_ret);
885}
886
887int
888iser_post_recvm(struct iser_conn *iser_conn, int count)
889{
890	const struct ib_recv_wr *rx_wr_failed;
891	struct ib_recv_wr *rx_wr;
892	int i, ib_ret;
893	struct ib_conn *ib_conn = &iser_conn->ib_conn;
894	unsigned int my_rx_head = iser_conn->rx_desc_head;
895	struct iser_rx_desc *rx_desc;
896
897	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
898		rx_desc		= &iser_conn->rx_descs[my_rx_head];
899		rx_wr->wr_id	= (uintptr_t)rx_desc;
900		rx_wr->sg_list	= &rx_desc->rx_sg;
901		rx_wr->num_sge	= 1;
902		rx_wr->next	= rx_wr + 1;
903		my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos;
904	}
905
906	rx_wr--;
907	rx_wr->next = NULL; /* mark end of work requests list */
908
909	ib_conn->post_recv_buf_count += count;
910	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
911	if (ib_ret) {
912		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
913		ib_conn->post_recv_buf_count -= count;
914	} else
915		iser_conn->rx_desc_head = my_rx_head;
916
917	return (ib_ret);
918}
919
920/**
921 * iser_start_send - Initiate a Send DTO operation
922 *
923 * returns 0 on success, -1 on failure
924 */
925int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
926		   bool signal)
927{
928	int		  ib_ret;
929	const struct ib_send_wr *send_wr_failed;
930	struct ib_send_wr send_wr;
931
932	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
933				      tx_desc->dma_addr, ISER_HEADERS_LEN,
934				      DMA_TO_DEVICE);
935
936	send_wr.next	   = NULL;
937	send_wr.wr_id	   = (uintptr_t)tx_desc;
938	send_wr.sg_list	   = tx_desc->tx_sg;
939	send_wr.num_sge	   = tx_desc->num_sge;
940	send_wr.opcode	   = IB_WR_SEND;
941	send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
942
943	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
944	if (ib_ret)
945		ISER_ERR("ib_post_send failed, ret:%d", ib_ret);
946
947	return (ib_ret);
948}
949