1/* $FreeBSD: releng/11.0/sys/dev/iser/iser_verbs.c 300837 2016-05-27 11:37:02Z trasz $ */
2/*-
3 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include "icl_iser.h"
28
/* Malloc type used for every allocation made in this file. */
static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
/*
 * Number of completions after which one run of iser_cq_tasklet_fn() stops
 * polling its CQ and re-arms it.
 */
static int iser_cq_poll_limit = 512;
31
/*
 * Event callback passed to ib_create_cq() for the completion queues.
 * The event is only logged; @context is not used.
 */
static void
iser_cq_event_callback(struct ib_event *cause, void *context)
{
	ISER_ERR("got cq event %d", cause->event);
}
37
/*
 * Event callback installed in the QP init attributes by
 * iser_create_ib_conn_res(). The event is only logged; @context is not used.
 */
static void
iser_qp_event_callback(struct ib_event *cause, void *context)
{
	ISER_ERR("got qp event %d", cause->event);
}
43
/*
 * Device event handler registered by iser_create_device_ib_res().
 * Logs the event number together with the device name and port number;
 * no other action is taken.
 */
static void
iser_event_handler(struct ib_event_handler *handler,
				struct ib_event *event)
{
	ISER_ERR("async event %d on device %s port %d",
		 event->event, event->device->name,
		 event->element.port_num);
}
52
53/**
54 * is_iser_tx_desc - Indicate if the completion wr_id
55 *     is a TX descriptor or not.
56 * @iser_conn: iser connection
57 * @wr_id: completion WR identifier
58 *
59 * Since we cannot rely on wc opcode in FLUSH errors
60 * we must work around it by checking if the wr_id address
61 * falls in the iser connection rx_descs buffer. If so
62 * it is an RX descriptor, otherwize it is a TX.
63 */
64static inline bool
65is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
66{
67	void *start = iser_conn->rx_descs;
68	u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
69	void *end = (void *)((uintptr_t)start + (uintptr_t)len);
70
71	if (start) {
72		if (wr_id >= start && wr_id < end)
73			return false;
74	} else {
75		return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
76	}
77
78	return true;
79}
80
81/**
82 * iser_handle_comp_error() - Handle error completion
83 * @ib_conn:   connection RDMA resources
84 * @wc:        work completion
85 *
86 * Notes: Update post_recv_buf_count in case of recv error completion.
87 *        For non-FLUSH error completion we should also notify iscsi layer that
88 *        connection is failed (in case we passed bind stage).
89 */
90static void
91iser_handle_comp_error(struct ib_conn *ib_conn,
92		       struct ib_wc *wc)
93{
94	void *wr_id = (void *)(uintptr_t)wc->wr_id;
95	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
96						   ib_conn);
97
98	if (is_iser_tx_desc(iser_conn, wr_id)) {
99		ISER_DBG("conn %p got send comp error", iser_conn);
100	} else {
101		ISER_DBG("conn %p got recv comp error", iser_conn);
102		ib_conn->post_recv_buf_count--;
103	}
104	if (wc->status != IB_WC_WR_FLUSH_ERR)
105		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
106}
107
108/**
109 * iser_handle_wc - handle a single work completion
110 * @wc: work completion
111 *
112 * Soft-IRQ context, work completion can be either
113 * SEND or RECV, and can turn out successful or
114 * with error (or flush error).
115 */
116static void iser_handle_wc(struct ib_wc *wc)
117{
118	struct ib_conn *ib_conn;
119	struct iser_tx_desc *tx_desc;
120	struct iser_rx_desc *rx_desc;
121
122	ib_conn = wc->qp->qp_context;
123	if (likely(wc->status == IB_WC_SUCCESS)) {
124		if (wc->opcode == IB_WC_RECV) {
125			rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
126			iser_rcv_completion(rx_desc, wc->byte_len,
127					    ib_conn);
128		} else
129		if (wc->opcode == IB_WC_SEND) {
130			tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
131			iser_snd_completion(tx_desc, ib_conn);
132		} else {
133			ISER_ERR("Unknown wc opcode %d", wc->opcode);
134		}
135	} else {
136		struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
137					ib_conn);
138		if (wc->status != IB_WC_WR_FLUSH_ERR) {
139			ISER_ERR("conn %p wr id %llx status %d vend_err %x",
140				 iser_conn, (unsigned long long)wc->wr_id,
141				 wc->status, wc->vendor_err);
142		} else {
143			ISER_DBG("flush error: conn %p wr id %llx",
144				 iser_conn, (unsigned long long)wc->wr_id);
145		}
146
147		if (wc->wr_id == ISER_BEACON_WRID) {
148			/* all flush errors were consumed */
149			mtx_lock(&ib_conn->beacon.flush_lock);
150			ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn);
151			cv_signal(&ib_conn->beacon.flush_cv);
152			mtx_unlock(&ib_conn->beacon.flush_lock);
153		} else {
154			iser_handle_comp_error(ib_conn, wc);
155		}
156	}
157}
158
159static void
160iser_cq_tasklet_fn(void *data, int pending)
161{
162	struct iser_comp *comp = (struct iser_comp *)data;
163	struct ib_cq *cq = comp->cq;
164	struct ib_wc *const wcs = comp->wcs;
165	int completed = 0;
166	int i;
167	int n;
168
169	while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
170		for (i = 0; i < n; i++)
171			iser_handle_wc(&wcs[i]);
172
173		completed += n;
174		if (completed >= iser_cq_poll_limit)
175			break;
176	}
177
178	/*
179	 * It is assumed here that arming CQ only once its empty
180	 * would not cause interrupts to be missed.
181	 */
182	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
183}
184
/*
 * Completion callback passed to ib_create_cq(). @cq_context is the
 * iser_comp owning the CQ; the actual polling is deferred to the
 * completion context's taskqueue (iser_cq_tasklet_fn()).
 */
static void
iser_cq_callback(struct ib_cq *cq, void *cq_context)
{
	struct iser_comp *comp = cq_context;

	taskqueue_enqueue(comp->tq, &comp->task);
}
192
193/**
194 * iser_create_device_ib_res - creates Protection Domain (PD), Completion
195 * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
196 * the adapator.
197 *
198 * returns 0 on success, -1 on failure
199 */
200static int
201iser_create_device_ib_res(struct iser_device *device)
202{
203	struct ib_device_attr *dev_attr = &device->dev_attr;
204	int ret, i, max_cqe;
205
206	ret = ib_query_device(device->ib_device, dev_attr);
207	if (ret) {
208		ISER_ERR("Query device failed for %s", device->ib_device->name);
209		return (ret);
210	}
211
212	if (!(dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
213		ISER_ERR("device %s doesn't support Fastreg, "
214			 "can't register memory", device->ib_device->name);
215		return (1);
216	}
217
218	device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors);
219
220	device->comps = malloc(device->comps_used * sizeof(*device->comps),
221		M_ISER_VERBS, M_WAITOK | M_ZERO);
222	if (!device->comps)
223		goto comps_err;
224
225	max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
226
227	ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d",
228		 device->comps_used, device->ib_device->name,
229		 device->ib_device->num_comp_vectors, max_cqe);
230
231	device->pd = ib_alloc_pd(device->ib_device);
232	if (IS_ERR(device->pd))
233		goto pd_err;
234
235	for (i = 0; i < device->comps_used; i++) {
236		struct iser_comp *comp = &device->comps[i];
237
238		comp->device = device;
239		comp->cq = ib_create_cq(device->ib_device,
240					iser_cq_callback,
241					iser_cq_event_callback,
242					(void *)comp,
243					max_cqe, i);
244		if (IS_ERR(comp->cq)) {
245			comp->cq = NULL;
246			goto cq_err;
247		}
248
249		if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
250			goto cq_err;
251
252		TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp);
253		comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT,
254				taskqueue_thread_enqueue, &comp->tq);
255		if (!comp->tq)
256			goto tq_err;
257		taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq");
258	}
259
260	device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
261				   IB_ACCESS_REMOTE_WRITE |
262				   IB_ACCESS_REMOTE_READ);
263	if (IS_ERR(device->mr))
264		goto tq_err;
265
266	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
267				iser_event_handler);
268	if (ib_register_event_handler(&device->event_handler))
269		goto handler_err;
270
271	return (0);
272
273handler_err:
274	ib_dereg_mr(device->mr);
275tq_err:
276	for (i = 0; i < device->comps_used; i++) {
277		struct iser_comp *comp = &device->comps[i];
278		if (comp->tq)
279			taskqueue_free(comp->tq);
280	}
281cq_err:
282	for (i = 0; i < device->comps_used; i++) {
283		struct iser_comp *comp = &device->comps[i];
284		if (comp->cq)
285			ib_destroy_cq(comp->cq);
286	}
287	ib_dealloc_pd(device->pd);
288pd_err:
289	free(device->comps, M_ISER_VERBS);
290comps_err:
291	ISER_ERR("failed to allocate an IB resource");
292	return (1);
293}
294
295/**
296 * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
297 * CQ and PD created with the device associated with the adapator.
298 */
299static void
300iser_free_device_ib_res(struct iser_device *device)
301{
302	int i;
303
304	for (i = 0; i < device->comps_used; i++) {
305		struct iser_comp *comp = &device->comps[i];
306
307		taskqueue_free(comp->tq);
308		ib_destroy_cq(comp->cq);
309		comp->cq = NULL;
310	}
311
312	(void)ib_unregister_event_handler(&device->event_handler);
313	(void)ib_dereg_mr(device->mr);
314	(void)ib_dealloc_pd(device->pd);
315
316	free(device->comps, M_ISER_VERBS);
317	device->comps = NULL;
318
319	device->mr = NULL;
320	device->pd = NULL;
321}
322
323static int
324iser_alloc_reg_res(struct ib_device *ib_device,
325		   struct ib_pd *pd,
326		   struct iser_reg_resources *res)
327{
328	int ret;
329
330	res->frpl = ib_alloc_fast_reg_page_list(ib_device,
331						ISCSI_ISER_SG_TABLESIZE + 1);
332	if (IS_ERR(res->frpl)) {
333		ret = -PTR_ERR(res->frpl);
334		ISER_ERR("Failed to allocate fast reg page list err=%d", ret);
335		return (ret);
336	}
337
338	res->mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
339	if (IS_ERR(res->mr)) {
340		ret = -PTR_ERR(res->mr);
341		ISER_ERR("Failed to allocate  fast reg mr err=%d", ret);
342		goto fast_reg_mr_failure;
343	}
344	res->mr_valid = 1;
345
346	return (0);
347
348fast_reg_mr_failure:
349	ib_free_fast_reg_page_list(res->frpl);
350
351	return (ret);
352}
353
/*
 * Release the registration resources of @rsc: deregister the MR and free
 * the fast registration page list (the counterpart of iser_alloc_reg_res()).
 */
static void
iser_free_reg_res(struct iser_reg_resources *rsc)
{
	ib_dereg_mr(rsc->mr);
	ib_free_fast_reg_page_list(rsc->frpl);
}
360
361static struct fast_reg_descriptor *
362iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd)
363{
364	struct fast_reg_descriptor *desc;
365	int ret;
366
367	desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO);
368	if (!desc) {
369		ISER_ERR("Failed to allocate a new fastreg descriptor");
370		return (NULL);
371	}
372
373	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
374	if (ret) {
375		ISER_ERR("failed to allocate reg_resources");
376		goto err;
377	}
378
379	return (desc);
380err:
381	free(desc, M_ISER_VERBS);
382	return (NULL);
383}
384
385/**
386 * iser_create_fmr_pool - Creates FMR pool and page_vector
387 *
388 * returns 0 on success, or errno code on failure
389 */
390int
391iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
392{
393	struct iser_device *device = ib_conn->device;
394	struct fast_reg_descriptor *desc;
395	int i;
396
397	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
398	ib_conn->fastreg.pool_size = 0;
399	for (i = 0; i < cmds_max; i++) {
400		desc = iser_create_fastreg_desc(device->ib_device, device->pd);
401		if (!desc) {
402			ISER_ERR("Failed to create fastreg descriptor");
403			goto err;
404		}
405
406		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
407		ib_conn->fastreg.pool_size++;
408	}
409
410	return (0);
411
412err:
413	iser_free_fastreg_pool(ib_conn);
414	return (ENOMEM);
415}
416
417/**
418 * iser_free_fmr_pool - releases the FMR pool and page vec
419 */
420void
421iser_free_fastreg_pool(struct ib_conn *ib_conn)
422{
423	struct fast_reg_descriptor *desc, *tmp;
424	int i = 0;
425
426	if (list_empty(&ib_conn->fastreg.pool))
427		return;
428
429	ISER_DBG("freeing conn %p fr pool", ib_conn);
430
431	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
432		list_del(&desc->list);
433		iser_free_reg_res(&desc->rsc);
434		free(desc, M_ISER_VERBS);
435		++i;
436	}
437
438	if (i < ib_conn->fastreg.pool_size)
439		ISER_WARN("pool still has %d regions registered",
440			  ib_conn->fastreg.pool_size - i);
441}
442
443/**
444 * iser_create_ib_conn_res - Queue-Pair (QP)
445 *
446 * returns 0 on success, 1 on failure
447 */
448static int
449iser_create_ib_conn_res(struct ib_conn *ib_conn)
450{
451	struct iser_conn *iser_conn;
452	struct iser_device *device;
453	struct ib_device_attr *dev_attr;
454	struct ib_qp_init_attr init_attr;
455	int index, min_index = 0;
456	int ret = -ENOMEM;
457
458	iser_conn = container_of(ib_conn, struct iser_conn, ib_conn);
459	device = ib_conn->device;
460	dev_attr = &device->dev_attr;
461
462	mtx_lock(&ig.connlist_mutex);
463	/* select the CQ with the minimal number of usages */
464	for (index = 0; index < device->comps_used; index++) {
465		if (device->comps[index].active_qps <
466		    device->comps[min_index].active_qps)
467			min_index = index;
468	}
469	ib_conn->comp = &device->comps[min_index];
470	ib_conn->comp->active_qps++;
471	mtx_unlock(&ig.connlist_mutex);
472	ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn);
473
474	memset(&init_attr, 0, sizeof init_attr);
475	init_attr.event_handler = iser_qp_event_callback;
476	init_attr.qp_context	= (void *)ib_conn;
477	init_attr.send_cq	= ib_conn->comp->cq;
478	init_attr.recv_cq	= ib_conn->comp->cq;
479	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
480	init_attr.cap.max_send_sge = 2;
481	init_attr.cap.max_recv_sge = 1;
482	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
483	init_attr.qp_type	= IB_QPT_RC;
484
485	if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
486		init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
487		iser_conn->max_cmds =
488			ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
489	} else {
490		init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
491		iser_conn->max_cmds =
492			ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
493	}
494	ISER_DBG("device %s supports max_send_wr %d",
495	         device->ib_device->name, dev_attr->max_qp_wr);
496
497	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
498	if (ret)
499		goto out_err;
500
501	ib_conn->qp = ib_conn->cma_id->qp;
502	ISER_DBG("setting conn %p cma_id %p qp %p",
503		 ib_conn, ib_conn->cma_id,
504		 ib_conn->cma_id->qp);
505
506	return (ret);
507
508out_err:
509	mtx_lock(&ig.connlist_mutex);
510	ib_conn->comp->active_qps--;
511	mtx_unlock(&ig.connlist_mutex);
512	ISER_ERR("unable to alloc mem or create resource, err %d", ret);
513
514	return (ret);
515}
516
517/**
518 * based on the resolved device node GUID see if there already allocated
519 * device for this device. If there's no such, create one.
520 */
521static struct iser_device *
522iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
523{
524	struct iser_device *device;
525
526	sx_xlock(&ig.device_list_mutex);
527
528	list_for_each_entry(device, &ig.device_list, ig_list)
529		/* find if there's a match using the node GUID */
530		if (device->ib_device->node_guid == cma_id->device->node_guid)
531			goto inc_refcnt;
532
533	device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO);
534	if (device == NULL)
535		goto out;
536
537	/* assign this device to the device */
538	device->ib_device = cma_id->device;
539	/* init the device and link it into ig device list */
540	if (iser_create_device_ib_res(device)) {
541		free(device, M_ISER_VERBS);
542		device = NULL;
543		goto out;
544	}
545	list_add(&device->ig_list, &ig.device_list);
546
547inc_refcnt:
548	device->refcount++;
549	ISER_INFO("device %p refcount %d", device, device->refcount);
550out:
551	sx_xunlock(&ig.device_list_mutex);
552	return (device);
553}
554
555/* if there's no demand for this device, release it */
556static void
557iser_device_try_release(struct iser_device *device)
558{
559	sx_xlock(&ig.device_list_mutex);
560	device->refcount--;
561	ISER_INFO("device %p refcount %d", device, device->refcount);
562	if (!device->refcount) {
563		iser_free_device_ib_res(device);
564		list_del(&device->ig_list);
565		free(device, M_ISER_VERBS);
566		device = NULL;
567	}
568	sx_xunlock(&ig.device_list_mutex);
569}
570
571/**
572 * Called with state mutex held
573 **/
574static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
575				     enum iser_conn_state comp,
576				     enum iser_conn_state exch)
577{
578	int ret;
579
580	ret = (iser_conn->state == comp);
581	if (ret)
582		iser_conn->state = exch;
583
584	return ret;
585}
586
587/**
588 * iser_free_ib_conn_res - release IB related resources
589 * @iser_conn: iser connection struct
590 * @destroy: indicator if we need to try to release the
591 *     iser device and memory regoins pool (only iscsi
592 *     shutdown and DEVICE_REMOVAL will use this).
593 *
594 * This routine is called with the iser state mutex held
595 * so the cm_id removal is out of here. It is Safe to
596 * be invoked multiple times.
597 */
598void
599iser_free_ib_conn_res(struct iser_conn *iser_conn,
600				  bool destroy)
601{
602	struct ib_conn *ib_conn = &iser_conn->ib_conn;
603	struct iser_device *device = ib_conn->device;
604
605	ISER_INFO("freeing conn %p cma_id %p qp %p",
606		  iser_conn, ib_conn->cma_id, ib_conn->qp);
607
608	if (ib_conn->qp != NULL) {
609		mtx_lock(&ig.connlist_mutex);
610		ib_conn->comp->active_qps--;
611		mtx_unlock(&ig.connlist_mutex);
612		rdma_destroy_qp(ib_conn->cma_id);
613		ib_conn->qp = NULL;
614	}
615
616	if (destroy) {
617		if (iser_conn->login_buf)
618			iser_free_login_buf(iser_conn);
619
620		if (iser_conn->rx_descs)
621			iser_free_rx_descriptors(iser_conn);
622
623		if (device != NULL) {
624			iser_device_try_release(device);
625			ib_conn->device = NULL;
626		}
627	}
628}
629
/**
 * triggers start of the disconnect procedures and wait for them to be done
 * Called with state mutex held
 *
 * Moves the connection from ISER_CONN_UP to ISER_CONN_TERMINATING,
 * disconnects the cma_id and then posts a "beacon" work request first on
 * the send queue and then on the receive queue, sleeping on the beacon
 * condition variable after each post until iser_handle_wc() signals that
 * the beacon completion was seen.
 *
 * Returns 0 if the connection was not in state UP (nothing is done),
 * 1 in every other case, including a missing QP or a failed beacon post.
 */
int
iser_conn_terminate(struct iser_conn *iser_conn)
{
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	struct ib_send_wr *bad_send_wr;
	struct ib_recv_wr *bad_recv_wr;
	int err = 0;

	/* terminate the iser conn only if the conn state is UP */
	if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
					   ISER_CONN_TERMINATING))
		return (0);

	ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state);

	if (ib_conn->qp == NULL) {
		/* HOW can this be??? */
		ISER_WARN("qp wasn't created");
		return (1);
	}

	/*
	 * Todo: This is a temporary workaround.
	 * We serialize the connection closure using global lock in order to
	 * receive all posted beacons completions.
	 * Without Serialization, in case we open many connections (QPs) on
	 * the same CQ, we might miss beacons because of missing interrupts.
	 */
	sx_xlock(&ig.close_conns_mutex);

	/*
	 * In case we didn't already clean up the cma_id (peer initiated
	 * a disconnection), we need to Cause the CMA to change the QP
	 * state to ERROR.
	 */
	if (ib_conn->cma_id) {
		/* A disconnect failure is logged only; the beacons are still posted. */
		err = rdma_disconnect(ib_conn->cma_id);
		if (err)
			ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
				iser_conn, err);

		/*
		 * flush_lock is taken before the post and released only by
		 * cv_wait(), and the completion side signals under the same
		 * lock, so the wakeup cannot slip in before we sleep.
		 */
		mtx_lock(&ib_conn->beacon.flush_lock);
		memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
		ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
		ib_conn->beacon.send.opcode = IB_WR_SEND;
		/* post an indication that all send flush errors were consumed */
		err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr);
		if (err) {
			ISER_ERR("conn %p failed to post send_beacon", ib_conn);
			mtx_unlock(&ib_conn->beacon.flush_lock);
			goto out;
		}

		ISER_DBG("before send cv_wait: %p", iser_conn);
		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
		ISER_DBG("after send cv_wait: %p", iser_conn);

		/* Same procedure for the receive queue; flush_lock is still held. */
		memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
		ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
		/* post an indication that all recv flush errors were consumed */
		err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr);
		if (err) {
			ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
			mtx_unlock(&ib_conn->beacon.flush_lock);
			goto out;
		}

		ISER_DBG("before recv cv_wait: %p", iser_conn);
		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
		mtx_unlock(&ib_conn->beacon.flush_lock);
		ISER_DBG("after recv cv_wait: %p", iser_conn);
	}
out:
	sx_xunlock(&ig.close_conns_mutex);
	/* err is not propagated: the caller only learns that termination ran. */
	return (1);
}
710
711/**
712 * Called with state mutex held
713 **/
714static void
715iser_connect_error(struct rdma_cm_id *cma_id)
716{
717	struct iser_conn *iser_conn;
718
719	iser_conn = cma_id->context;
720
721	ISER_ERR("conn %p", iser_conn);
722
723	iser_conn->state = ISER_CONN_TERMINATING;
724
725	cv_signal(&iser_conn->up_cv);
726}
727
728/**
729 * Called with state mutex held
730 **/
731static void
732iser_addr_handler(struct rdma_cm_id *cma_id)
733{
734	struct iser_device *device;
735	struct iser_conn   *iser_conn;
736	struct ib_conn   *ib_conn;
737	int    ret;
738
739	iser_conn = cma_id->context;
740
741	ib_conn = &iser_conn->ib_conn;
742	device = iser_device_find_by_ib_device(cma_id);
743	if (!device) {
744		ISER_ERR("conn %p device lookup/creation failed",
745			 iser_conn);
746		iser_connect_error(cma_id);
747		return;
748	}
749
750	ib_conn->device = device;
751
752	ret = rdma_resolve_route(cma_id, 1000);
753	if (ret) {
754		ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
755		iser_connect_error(cma_id);
756		return;
757	}
758}
759
760/**
761 * Called with state mutex held
762 **/
763static void
764iser_route_handler(struct rdma_cm_id *cma_id)
765{
766	struct rdma_conn_param conn_param;
767	int    ret;
768	struct iser_cm_hdr req_hdr;
769	struct iser_conn *iser_conn = cma_id->context;
770	struct ib_conn *ib_conn = &iser_conn->ib_conn;
771	struct iser_device *device = ib_conn->device;
772
773	ret = iser_create_ib_conn_res(ib_conn);
774	if (ret)
775		goto failure;
776
777	memset(&conn_param, 0, sizeof conn_param);
778	conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
779	conn_param.retry_count	       = 7;
780	conn_param.rnr_retry_count     = 6;
781	/*
782	 * Initiaotr depth should not be set, but in order to compat
783	 * with old targets, we keep this value set.
784	 */
785	conn_param.initiator_depth     = 1;
786
787	memset(&req_hdr, 0, sizeof(req_hdr));
788	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
789			ISER_SEND_W_INV_NOT_SUPPORTED);
790	conn_param.private_data		= (void *)&req_hdr;
791	conn_param.private_data_len	= sizeof(struct iser_cm_hdr);
792
793	ret = rdma_connect(cma_id, &conn_param);
794	if (ret) {
795		ISER_ERR("conn %p failure connecting: %d", iser_conn, ret);
796		goto failure;
797	}
798
799	return;
800failure:
801	iser_connect_error(cma_id);
802}
803
804/**
805 * Called with state mutex held
806 **/
807static void
808iser_connected_handler(struct rdma_cm_id *cma_id)
809{
810	struct iser_conn *iser_conn;
811	struct ib_qp_attr attr;
812	struct ib_qp_init_attr init_attr;
813
814	iser_conn = cma_id->context;
815
816	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
817
818	ISER_INFO("remote qpn:%x my qpn:%x",
819		  attr.dest_qp_num, cma_id->qp->qp_num);
820
821	iser_conn->state = ISER_CONN_UP;
822
823	cv_signal(&iser_conn->up_cv);
824}
825
826/**
827 * Called with state mutex held
828 **/
829static void
830iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy)
831{
832	struct iser_conn *iser_conn = cma_id->context;
833
834	if (iser_conn_terminate(iser_conn))
835		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
836
837}
838
839int
840iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
841{
842	struct iser_conn *iser_conn;
843	int ret = 0;
844
845	iser_conn = cma_id->context;
846	ISER_INFO("event %d status %d conn %p id %p",
847		  event->event, event->status, cma_id->context, cma_id);
848
849	sx_xlock(&iser_conn->state_mutex);
850	switch (event->event) {
851	case RDMA_CM_EVENT_ADDR_RESOLVED:
852		iser_addr_handler(cma_id);
853		break;
854	case RDMA_CM_EVENT_ROUTE_RESOLVED:
855		iser_route_handler(cma_id);
856		break;
857	case RDMA_CM_EVENT_ESTABLISHED:
858		iser_connected_handler(cma_id);
859		break;
860	case RDMA_CM_EVENT_ADDR_ERROR:
861	case RDMA_CM_EVENT_ROUTE_ERROR:
862	case RDMA_CM_EVENT_CONNECT_ERROR:
863	case RDMA_CM_EVENT_UNREACHABLE:
864	case RDMA_CM_EVENT_REJECTED:
865		iser_connect_error(cma_id);
866		break;
867	case RDMA_CM_EVENT_DISCONNECTED:
868	case RDMA_CM_EVENT_ADDR_CHANGE:
869	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
870		iser_cleanup_handler(cma_id, false);
871		break;
872	default:
873		ISER_ERR("Unexpected RDMA CM event (%d)", event->event);
874		break;
875	}
876	sx_xunlock(&iser_conn->state_mutex);
877
878	return (ret);
879}
880
881int
882iser_post_recvl(struct iser_conn *iser_conn)
883{
884	struct ib_recv_wr rx_wr, *rx_wr_failed;
885	struct ib_conn *ib_conn = &iser_conn->ib_conn;
886	struct ib_sge	  sge;
887	int ib_ret;
888
889	sge.addr   = iser_conn->login_resp_dma;
890	sge.length = ISER_RX_LOGIN_SIZE;
891	sge.lkey   = ib_conn->device->mr->lkey;
892
893	rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
894	rx_wr.sg_list = &sge;
895	rx_wr.num_sge = 1;
896	rx_wr.next    = NULL;
897
898	ib_conn->post_recv_buf_count++;
899	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
900	if (ib_ret) {
901		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
902		ib_conn->post_recv_buf_count--;
903	}
904
905	return (ib_ret);
906}
907
908int
909iser_post_recvm(struct iser_conn *iser_conn, int count)
910{
911	struct ib_recv_wr *rx_wr, *rx_wr_failed;
912	int i, ib_ret;
913	struct ib_conn *ib_conn = &iser_conn->ib_conn;
914	unsigned int my_rx_head = iser_conn->rx_desc_head;
915	struct iser_rx_desc *rx_desc;
916
917	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
918		rx_desc		= &iser_conn->rx_descs[my_rx_head];
919		rx_wr->wr_id	= (uintptr_t)rx_desc;
920		rx_wr->sg_list	= &rx_desc->rx_sg;
921		rx_wr->num_sge	= 1;
922		rx_wr->next	= rx_wr + 1;
923		my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos;
924	}
925
926	rx_wr--;
927	rx_wr->next = NULL; /* mark end of work requests list */
928
929	ib_conn->post_recv_buf_count += count;
930	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
931	if (ib_ret) {
932		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
933		ib_conn->post_recv_buf_count -= count;
934	} else
935		iser_conn->rx_desc_head = my_rx_head;
936
937	return (ib_ret);
938}
939
940/**
941 * iser_start_send - Initiate a Send DTO operation
942 *
943 * returns 0 on success, -1 on failure
944 */
945int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
946		   bool signal)
947{
948	int		  ib_ret;
949	struct ib_send_wr send_wr, *send_wr_failed;
950
951	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
952				      tx_desc->dma_addr, ISER_HEADERS_LEN,
953				      DMA_TO_DEVICE);
954
955	send_wr.next	   = NULL;
956	send_wr.wr_id	   = (uintptr_t)tx_desc;
957	send_wr.sg_list	   = tx_desc->tx_sg;
958	send_wr.num_sge	   = tx_desc->num_sge;
959	send_wr.opcode	   = IB_WR_SEND;
960	send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
961
962	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
963	if (ib_ret)
964		ISER_ERR("ib_post_send failed, ret:%d", ib_ret);
965
966	return (ib_ret);
967}
968