/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"

static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;

/* XXXNP: these prototypes belong in a shared header instead. */
struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);

static void
free_ci_counters(struct cxgbei_data *ci)
{

#define FREE_CI_COUNTER(x) do { \
	if (ci->x != NULL) { \
		counter_u64_free(ci->x); \
		ci->x = NULL; \
	} \
} while (0)

	FREE_CI_COUNTER(ddp_setup_ok);
	FREE_CI_COUNTER(ddp_setup_error);
	FREE_CI_COUNTER(ddp_bytes);
	FREE_CI_COUNTER(ddp_pdus);
	FREE_CI_COUNTER(fl_bytes);
	FREE_CI_COUNTER(fl_pdus);
#undef FREE_CI_COUNTER
}

static int
alloc_ci_counters(struct cxgbei_data *ci)
{

#define ALLOC_CI_COUNTER(x) do { \
	ci->x = counter_u64_alloc(M_WAITOK); \
	if (ci->x == NULL) \
		goto fail; \
} while (0)

	ALLOC_CI_COUNTER(ddp_setup_ok);
	ALLOC_CI_COUNTER(ddp_setup_error);
	ALLOC_CI_COUNTER(ddp_bytes);
	ALLOC_CI_COUNTER(ddp_pdus);
	ALLOC_CI_COUNTER(fl_bytes);
	ALLOC_CI_COUNTER(fl_pdus);
#undef ALLOC_CI_COUNTER

	return (0);
fail:
	free_ci_counters(ci);
	return (ENOMEM);
}

static void
read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len,
    uint32_t *max_rx_pdu_len)
{
	uint32_t tx_len, rx_len, r, v;

	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);

	r = t4_read_reg(sc, A_TP_PARA_REG2);
	rx_len = min(rx_len, G_MAXRXDATA(r));
	tx_len = min(tx_len, G_MAXRXDATA(r));

	r = t4_read_reg(sc, A_TP_PARA_REG7);
	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
	rx_len = min(rx_len, v);
	tx_len = min(tx_len, v);

	/* Remove after FW_FLOWC_MNEM_TXDATAPLEN_MAX fix in firmware. */
	tx_len = min(tx_len, 3 * 4096);

	*max_tx_pdu_len = rounddown2(tx_len, 512);
	*max_rx_pdu_len = rounddown2(rx_len, 512);
}
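
/*
 * Worked example, assuming hypothetical register values: with a 16KB
 * PMM page size, MAXRXDATA of 16384, and PMMAXXFERLEN0/1 of 16384, the
 * firmware workaround above caps tx_len at 3 * 4096 = 12288, giving
 * *max_tx_pdu_len = rounddown2(12288, 512) = 12288, while
 * *max_rx_pdu_len = rounddown2(16384, 512) = 16384.
 */
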
171
172/*
173 * Initialize the software state of the iSCSI ULP driver.
174 *
175 * ENXIO means firmware didn't set up something that it was supposed to.
176 */
177static int
178cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
179{
180	struct sysctl_oid *oid;
181	struct sysctl_oid_list *children;
182	struct ppod_region *pr;
183	uint32_t r;
184	int rc;
185
186	MPASS(sc->vres.iscsi.size > 0);
187	MPASS(ci != NULL);
188
189	rc = alloc_ci_counters(ci);
190	if (rc != 0)
191		return (rc);
192
193	read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len);
194
195	pr = &ci->pr;
196	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
197	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
198	if (rc != 0) {
199		device_printf(sc->dev,
200		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
201		    __func__, rc);
202		free_ci_counters(ci);
203		return (rc);
204	}
205
206	r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK);
207	r &= V_ISCSITAGMASK(M_ISCSITAGMASK);
208	if (r != pr->pr_tag_mask) {
209		/*
210		 * Recent firmwares are supposed to set up the iSCSI tagmask
211		 * but we'll do it ourselves it the computed value doesn't match
212		 * what's in the register.
213		 */
214		device_printf(sc->dev,
215		    "tagmask 0x%08x does not match computed mask 0x%08x.\n", r,
216		    pr->pr_tag_mask);
217		t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK,
218		    V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask);
219	}

	sysctl_ctx_init(&ci->ctx);
	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
	children = SYSCTL_CHILDREN(oid);

	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD,
	    NULL, "iSCSI ULP statistics");
	children = SYSCTL_CHILDREN(oid);

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_ok",
	    CTLFLAG_RD, &ci->ddp_setup_ok,
	    "# of times DDP buffer was set up successfully.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_error",
	    CTLFLAG_RD, &ci->ddp_setup_error,
	    "# of times DDP buffer setup failed.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_bytes",
	    CTLFLAG_RD, &ci->ddp_bytes, "# of bytes placed directly.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_pdus",
	    CTLFLAG_RD, &ci->ddp_pdus, "# of PDUs with data placed directly.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_bytes",
	    CTLFLAG_RD, &ci->fl_bytes,
	    "# of data bytes delivered in freelist.");

	SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_pdus",
	    CTLFLAG_RD, &ci->fl_pdus,
	    "# of PDUs with data delivered in freelist.");

	ci->ddp_threshold = 2048;
	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");

	return (0);
}
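
/*
 * The statistics registered above appear under the adapter's sysctl
 * tree.  For example, assuming a T5 adapter probed as unit 0:
 *
 *	# sysctl dev.t5nex.0.iscsi.ddp_bytes
 *	dev.t5nex.0.iscsi.ddp_bytes: 0
 */

/*
 * Receive path overview: the hardware delivers each inbound iSCSI PDU
 * as up to three CPL messages.  CPL_ISCSI_HDR carries the BHS,
 * CPL_ISCSI_DATA carries any payload delivered via freelist buffers
 * (payload placed directly by DDP generates no data CPL), and
 * CPL_RX_ISCSI_DDP completes the PDU with its digest and placement
 * status.  The handlers below assemble these pieces into an
 * icl_cxgbei_pdu hung off toep->ulpcb2 until the PDU is complete.
 */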

static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;
	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
	uint16_t len = be16toh(cpl->len);

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
	icp = ip_to_icp(ip);
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags = ICPF_RX_HDR;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
	    __func__, tid, len, len_ddp, icp);
#endif

	m_freem(m);
	return (0);
}

static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

	/* Must already have received the header (but not the data). */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags == ICPF_RX_HDR);
	MPASS(icp->ip.ip_data_mbuf == NULL);

	m_adj(m, sizeof(*cpl));
	MPASS(icp->ip.ip_data_len == m->m_pkthdr.len);

	icp->icp_flags |= ICPF_RX_FLBUF;
	icp->ip.ip_data_mbuf = m;
	counter_u64_add(ci->fl_pdus, 1);
	counter_u64_add(ci->fl_bytes, m->m_pkthdr.len);

#if 0
	CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid,
	    be16toh(cpl->len));
#endif

	return (0);
}
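
/*
 * Note: freelist delivery is the path for payload that was not placed
 * directly.  DDP is typically programmed only for buffers of at least
 * ddp_threshold bytes (see the tunable above), or not at all if page
 * pod setup failed; everything else arrives via CPL_ISCSI_DATA.
 */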

static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;
	struct epoch_tracker et;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

	pdu_len = be16toh(cpl->len);	/* includes everything. */
	val = be32toh(cpl->ddpvld);

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
	    __func__, tid, pdu_len, val, icp->icp_flags);
#endif

	icp->icp_flags |= ICPF_RX_STATUS;
	ip = &icp->ip;
	if (val & F_DDP_PADDING_ERR)
		icp->icp_flags |= ICPF_PAD_ERR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->icp_flags |= ICPF_HCRC_ERR;
	if (val & F_DDP_DATACRC_ERR)
		icp->icp_flags |= ICPF_DCRC_ERR;
	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		counter_u64_add(ci->ddp_pdus, 1);
		counter_u64_add(ci->ddp_bytes, ip->ip_data_len);
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}

	tp = intotcpcb(inp);
	MPASS(icp->icp_seq == tp->rcv_nxt);
	MPASS(tp->rcv_wnd >= pdu_len);
	tp->rcv_nxt += pdu_len;
	tp->rcv_wnd -= pdu_len;
	tp->t_rcvtime = ticks;

	/* update rx credits */
	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);

		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	MPASS(m == NULL); /* was unused, we'll use it now. */
	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
	if (__predict_false(m != NULL)) {
		int len = m_length(m, NULL);

		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
		    len);

		/* XXXNP: needs to be rewritten. */
		if (len == sizeof(struct iscsi_bhs) ||
		    len == 4 + sizeof(struct iscsi_bhs)) {
			struct icl_cxgbei_pdu *icp0;
			struct icl_pdu *ip0;

			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
			if (ip0 == NULL)
				CXGBE_UNIMPLEMENTED("PDU allocation failure");
			icl_cxgbei_new_pdu_set_conn(ip0, ic);
			icp0 = ip_to_icp(ip0);
			icp0->icp_seq = 0; /* XXX */
			icp0->icp_flags = ICPF_RX_HDR | ICPF_RX_STATUS;
			m_copydata(m, 0, sizeof(struct iscsi_bhs),
			    (void *)ip0->ip_bhs);
			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
		}
		m_freem(m);
	}

	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

#ifdef INVARIANTS
	toep->ulpcb2 = NULL;
#endif

	return (0);
}

static int
cxgbei_activate(struct adapter *sc)
{
	struct cxgbei_data *ci;
	int rc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (uld_active(sc, ULD_ISCSI)) {
		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
		    __func__, sc));
		return (0);
	}

	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
		device_printf(sc->dev,
		    "not iSCSI offload capable, or capability disabled.\n");
		return (ENOSYS);
	}

	/* per-adapter softc for iSCSI */
	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
	if (ci == NULL)
		return (ENOMEM);

	rc = cxgbei_init(sc, ci);
	if (rc != 0) {
		free(ci, M_CXGBE);
		return (rc);
	}

	sc->iscsi_ulp_softc = ci;

	return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (ci != NULL) {
		sysctl_ctx_free(&ci->ctx);
		t4_free_ppod_region(&ci->pr);
		free_ci_counters(ci);
		free(ci, M_CXGBE);
		sc->iscsi_ulp_softc = NULL;
	}

	return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
		return;

	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
		(void) t4_activate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI))
		(void) t4_deactivate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};

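/*
 * Main loop of a connection worker thread.  Connections with pending
 * rx PDUs are queued on cwt->rx_head by do_rx_iscsi_ddp.  The thread
 * swaps each connection's rcvd_pdus onto a local list while holding
 * the sockbuf lock, then hands the batch to ICL with no locks held.
 * A connection that accumulated more PDUs during delivery is requeued
 * at the tail; one that didn't has RXF_ACTIVE cleared and must be
 * requeued by the rx path when more PDUs arrive.
 */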
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	cwt->cwt_state = CWT_RUNNING;
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}

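/*
 * Start one worker thread per CPU, capped at 32.  A failure to start
 * the very first thread is fatal; a later failure just leaves the
 * module running with however many threads did start.
 */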
static int
start_worker_threads(void)
{
	int i, rc;
	struct cxgbei_worker_thread_softc *cwt;

	worker_thread_count = min(mp_ncpus, 32);
	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
	    M_WAITOK | M_ZERO);

	MPASS(cxgbei_proc == NULL);
	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
		cv_init(&cwt->cwt_cv, "cwt cv");
		TAILQ_INIT(&cwt->rx_head);
		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
		    "cxgbei", "%d", i);
		if (rc != 0) {
			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
			    i + 1, worker_thread_count, rc);
			mtx_destroy(&cwt->cwt_lock);
			cv_destroy(&cwt->cwt_cv);
			bzero(cwt, sizeof(*cwt));
			if (i == 0) {
				free(cwt_softc, M_CXGBE);
				worker_thread_count = 0;

				return (rc);
			}

			/* Not fatal, carry on with fewer threads. */
			worker_thread_count = i;
			rc = 0;
			break;
		}

		/* Wait for thread to start before moving on to the next one. */
		mtx_lock(&cwt->cwt_lock);
		while (cwt->cwt_state == 0)
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		mtx_unlock(&cwt->cwt_lock);
	}

	MPASS(cwt_softc != NULL);
	MPASS(worker_thread_count > 0);
	return (0);
}

static void
stop_worker_threads(void)
{
	int i;
	struct cxgbei_worker_thread_softc *cwt;

	MPASS(worker_thread_count >= 0);

	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_lock(&cwt->cwt_lock);
		MPASS(cwt->cwt_state == CWT_RUNNING ||
		    cwt->cwt_state == CWT_SLEEPING);
		cwt->cwt_state = CWT_STOP;
		cv_signal(&cwt->cwt_cv);
		do {
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		} while (cwt->cwt_state != CWT_STOPPED);
		mtx_unlock(&cwt->cwt_lock);
		mtx_destroy(&cwt->cwt_lock);
		cv_destroy(&cwt->cwt_cv);
	}
	free(cwt_softc, M_CXGBE);
}

/* Select a worker thread for a connection. */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
	struct adapter *sc = icc->sc;
	struct toepcb *toep = icc->toep;
	u_int i, n;

	n = worker_thread_count / sc->sge.nofldrxq;
	if (n > 0)
		i = toep->vi->pi->port_id * n + arc4random() % n;
	else
		i = arc4random() % worker_thread_count;

	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

	return (i);
}
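
/*
 * Worked example, assuming a hypothetical configuration: with 32
 * worker threads and 8 offload rx queues, n = 32 / 8 = 4, so
 * connections on port 0 are spread randomly over cwt 0-3, port 1 over
 * cwt 4-7, and so on.
 */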

static int
cxgbei_mod_load(void)
{
	int rc;

	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);

	rc = start_worker_threads();
	if (rc != 0)
		return (rc);

	rc = t4_register_uld(&cxgbei_uld_info);
	if (rc != 0) {
		stop_worker_threads();
		return (rc);
	}

	t4_iterate(cxgbei_activate_all, NULL);

	return (rc);
}

static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	stop_worker_threads();

	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);

	return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		if (rc == 0)
			rc = icl_cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = icl_cxgbei_mod_unload();
		if (rc == 0)
			rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}

static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);