/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/cxgbei/cxgbei.c 348704 2019-06-05 21:46:56Z np $");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"
#include "cxgbei_ulp2_ddp.h"

static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;

/* XXXNP some header instead. */
struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);

/*
 * Direct Data Placement -
 * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted
 * final destination host-memory buffers based on the Initiator Task Tag (ITT)
 * in Data-In or the Target Task Tag (TTT) in Data-Out PDUs.
 * The host memory address is programmed into the hardware in the form of
 * pagepod entries.
 * The location of the pagepod entry is encoded into a DDP tag, which is used
 * as the base for the ITT/TTT.
 */

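/*
 * Reader's note (a sketch only): cxgbei_ddp_init() below programs a tag
 * format in which the pagepod index occupies tag_format.rsvd_bits bits
 * starting at bit IPPOD_IDX_SHIFT, with the bits above that carrying part of
 * the original ITT/TTT.  The encode/decode helpers themselves
 * (cxgbei_ulp2_ddp_tag_reserve(), cxgbei_ulp2_set_non_ddp_tag(), ...) are
 * declared in cxgbei_ulp2_ddp.h and implemented elsewhere; the layout
 * described here is only what the fields programmed below imply.
 */
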
/*
 * functions to program the pagepod in h/w
 */
static inline void
ppod_set(struct pagepod *ppod,
	struct cxgbei_ulp2_pagepod_hdr *hdr,
	struct cxgbei_ulp2_gather_list *gl,
	unsigned int pidx)
{
	int i;

	memcpy(ppod, hdr, sizeof(*hdr));

	for (i = 0; i < (PPOD_PAGES + 1); i++, pidx++) {
		ppod->addr[i] = pidx < gl->nelem ?
			cpu_to_be64(gl->dma_sg[pidx].phys_addr) : 0ULL;
	}
}

static inline void
ppod_clear(struct pagepod *ppod)
{
	memset(ppod, 0, sizeof(*ppod));
}

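/*
 * Fill in the header of a ULP_TX memory-write work request: a ULPTX WR
 * carrying a ULP_TX_MEM_WRITE command whose payload (the pagepods) follows
 * as immediate data (ULP_TX_SC_IMM).  dlen and pm_addr are in bytes; the
 * hardware fields are in 32-byte units, hence the >> 5 shifts.
 */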
static inline void
ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req,
		unsigned int wr_len, unsigned int dlen,
		unsigned int pm_addr)
{
	struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1);

	INIT_ULPTX_WR(req, wr_len, 0, 0);
	req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
				V_ULP_MEMIO_ORDER(is_t4(sc)) |
				V_T5_ULP_MEMIO_IMM(is_t5(sc)));
	req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5));
	req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)
				| V_FW_WR_FLOWID(tid));
	req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5));

	idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM));
	idata->len = htonl(dlen);
}

#define ULPMEM_IDATA_MAX_NPPODS 1	/* 256/PPOD_SIZE */
#define PCIE_MEMWIN_MAX_NPPODS 16	/* 1024/PPOD_SIZE */

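/*
 * Write "npods" pagepods, starting at pagepod index "idx" in the adapter's
 * iSCSI pagepod region, using a single ULP_TX_MEM_WRITE work request with
 * the pagepods as immediate data.  A NULL hdr clears the pagepods instead
 * of setting them.
 */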
static int
ppod_write_idata(struct cxgbei_data *ci,
			struct cxgbei_ulp2_pagepod_hdr *hdr,
			unsigned int idx, unsigned int npods,
			struct cxgbei_ulp2_gather_list *gl,
			unsigned int gl_pidx, struct toepcb *toep)
{
	u_int dlen = PPOD_SIZE * npods;
	u_int pm_addr = idx * PPOD_SIZE + ci->llimit;
	u_int wr_len = roundup(sizeof(struct ulp_mem_io) +
	    sizeof(struct ulptx_idata) + dlen, 16);
	struct ulp_mem_io *req;
	struct ulptx_idata *idata;
	struct pagepod *ppod;
	u_int i;
	struct wrqe *wr;
	struct adapter *sc = toep->vi->pi->adapter;

	wr = alloc_wrqe(wr_len, toep->ctrlq);
	if (wr == NULL) {
		CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure");
		return (ENOMEM);
	}

	req = wrtod(wr);
	memset(req, 0, wr_len);
	ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr);
	idata = (struct ulptx_idata *)(req + 1);

	ppod = (struct pagepod *)(idata + 1);
	for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) {
		if (hdr == NULL) /* clear the pagepod */
			ppod_clear(ppod);
		else /* set the pagepod */
			ppod_set(ppod, hdr, gl, gl_pidx);
	}

	t4_wrq_tx(sc, wr);
	return (0);
}

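/*
 * Program the pagepods describing "gl" into adapter memory, starting at
 * pagepod index "idx", pushing at most ULPMEM_IDATA_MAX_NPPODS per work
 * request.
 */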
int
t4_ddp_set_map(struct cxgbei_data *ci, void *iccp,
    struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods,
    struct cxgbei_ulp2_gather_list *gl, int reply)
{
	struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp;
	struct toepcb *toep = icc->toep;
	int err;
	unsigned int pidx = 0, w_npods = 0, cnt;

	/*
	 * on T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE,
	 * the order would not be guaranteed, so we will stick with IMMD
	 */
	gl->tid = toep->tid;
	gl->port_id = toep->vi->pi->port_id;
	gl->egress_dev = (void *)toep->vi->ifp;

	/* send via immediate data */
	for (; w_npods < npods; idx += cnt, w_npods += cnt,
		pidx += PPOD_PAGES) {
		cnt = npods - w_npods;
		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
			cnt = ULPMEM_IDATA_MAX_NPPODS;
		err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep);
		if (err) {
			printf("%s: ppod_write_idata failed\n", __func__);
			break;
		}
	}
	return (err);
}

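/*
 * Clear the pagepods previously programmed for a gather list (a NULL hdr
 * makes ppod_write_idata() zero them out).
 */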
void
t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl,
    u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc)
{
	struct toepcb *toep = icc->toep;
	int err = -1;
	u_int pidx = 0;
	u_int w_npods = 0;
	u_int cnt;

	for (; w_npods < npods; idx += cnt, w_npods += cnt,
		pidx += PPOD_PAGES) {
		cnt = npods - w_npods;
		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
			cnt = ULPMEM_IDATA_MAX_NPPODS;
		err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep);
		if (err)
			break;
	}
}

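/*
 * Populate a cxgbei_sgl array with one entry per page covered by the CCB's
 * data buffer (initiator side).  Returns the number of entries needed; only
 * the first entry may start at a non-zero page offset.
 */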
static int
cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio)
{
	unsigned int data_len = csio->dxfer_len;
	unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK;
	unsigned int nsge;
	unsigned char *sgaddr = csio->data_ptr;
	unsigned int len = 0;

	nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	sgl->sg_addr = sgaddr;
	sgl->sg_offset = sgoffset;
	if (data_len < (PAGE_SIZE - sgoffset))
		len = data_len;
	else
		len = PAGE_SIZE - sgoffset;

	sgl->sg_length = len;

	data_len -= len;
	sgaddr += len;
	sgl = sgl + 1;

	while (data_len > 0) {
		sgl->sg_addr = sgaddr;
		len = (data_len < PAGE_SIZE) ? data_len : PAGE_SIZE;
		sgl->sg_length = len;
		sgaddr += len;
		data_len -= len;
		sgl = sgl + 1;
	}

	return (nsge);
}

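/*
 * Target-side counterpart of cxgbei_map_sg(): build the cxgbei_sgl array
 * from the ctl_io's kernel S/G list (or its single flat buffer) and return
 * the number of page-sized entries required.
 */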
static int
cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io)
{
	unsigned int data_len, sgoffset, nsge;
	unsigned char *sgaddr;
	unsigned int len = 0, index = 0, ctl_sg_count, i;
	struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;

	if (io->scsiio.kern_sg_entries > 0) {
		ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr;
		ctl_sg_count = io->scsiio.kern_sg_entries;
	} else {
		ctl_sglist = &ctl_sg_entry;
		ctl_sglist->addr = io->scsiio.kern_data_ptr;
		ctl_sglist->len = io->scsiio.kern_data_len;
		ctl_sg_count = 1;
	}

	sgaddr = sgl->sg_addr = ctl_sglist[index].addr;
	sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK;
	data_len = ctl_sglist[index].len;

	if (data_len < (PAGE_SIZE - sgoffset))
		len = data_len;
	else
		len = PAGE_SIZE - sgoffset;

	sgl->sg_length = len;

	data_len -= len;
	sgaddr += len;
	sgl = sgl + 1;

	len = 0;
	for (i = 0; i < ctl_sg_count; i++)
		len += ctl_sglist[i].len;
	nsge = (len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	while (data_len > 0) {
		sgl->sg_addr = sgaddr;
		len = (data_len < PAGE_SIZE) ? data_len : PAGE_SIZE;
		sgl->sg_length = len;
		sgaddr += len;
		data_len -= len;
		sgl = sgl + 1;
		if (data_len == 0) {
			if (index == ctl_sg_count - 1)
				break;
			index++;
			sgaddr = ctl_sglist[index].addr;
			data_len = ctl_sglist[index].len;
		}
	}

	return (nsge);
}

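/*
 * Build a DDP gather list for the buffer described by sgl/sgcnt and reserve
 * a DDP tag (and the pagepods behind it) for it.  On failure the gather
 * list is released and a non-zero error is returned.
 */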
static int
t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc,
    u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag)
{
	struct cxgbei_ulp2_gather_list *gl;
	int err = -EINVAL;
	struct toepcb *toep = icc->toep;

	gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0);
	if (gl) {
		err = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid,
		    &ci->tag_format, ddp_tag, gl, 0, 0);
		if (err) {
			cxgbei_ulp2_ddp_release_gl(ci, gl);
		}
	}

	return (err);
}

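/*
 * Initiator: try to reserve a DDP tag for a Data-In transfer and return the
 * tag to use as the ITT.  If the transfer is too small, is not Data-In, the
 * software tag is not usable, or the reservation fails, a non-DDP tag
 * derived from the original ITT is returned instead.
 */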
static unsigned int
cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv,
			struct ccb_scsiio *scmd, unsigned int *itt)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	int xferlen = scmd->dxfer_len;
	struct cxgbei_task_data *tdata = NULL;
	struct cxgbei_sgl *sge = NULL;
	struct toepcb *toep = icc->toep;
	struct adapter *sc = td_adapter(toep->td);
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	int err = -1;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	tdata = (struct cxgbei_task_data *)*prv;
	if (xferlen == 0 || tdata == NULL)
		goto out;
	if (xferlen < DDP_THRESHOLD)
		goto out;

	if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
		tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd);
		if (tdata->nsge == 0) {
			CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
			return (0);
		}
		sge = tdata->sgl;

		tdata->sc_ddp_tag = *itt;

		CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x",
				__func__, *itt, tdata->sc_ddp_tag);
		if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format,
							tdata->sc_ddp_tag)) {
			err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len,
			    sge, tdata->nsge, &tdata->sc_ddp_tag);
		} else {
			CTR3(KTR_CXGBE,
				"%s: itt:0x%x sc_ddp_tag:0x%x not usable",
				__func__, *itt, tdata->sc_ddp_tag);
		}
	}
out:
	if (err < 0)
		tdata->sc_ddp_tag =
			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt);

	return (tdata->sc_ddp_tag);
}

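/*
 * Target: same as cxgbei_task_reserve_itt(), but for Data-Out transfers;
 * the returned tag is used as the TTT for the transfer.
 */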
static unsigned int
cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io,
				unsigned int *ttt)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct adapter *sc = td_adapter(toep->td);
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct cxgbei_task_data *tdata = NULL;
	int xferlen, err = -1;
	struct cxgbei_sgl *sge = NULL;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
	tdata = (struct cxgbei_task_data *)*prv;
	if ((xferlen == 0) || (tdata == NULL))
		goto out;
	if (xferlen < DDP_THRESHOLD)
		goto out;
	tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io);
	if (tdata->nsge == 0) {
		CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
		return (0);
	}
	sge = tdata->sgl;

	tdata->sc_ddp_tag = *ttt;
	if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) {
		err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge,
		    tdata->nsge, &tdata->sc_ddp_tag);
	} else {
		CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable",
				__func__, tdata->sc_ddp_tag);
	}
out:
	if (err < 0)
		tdata->sc_ddp_tag =
			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt);
	return (tdata->sc_ddp_tag);
}

static int
t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag)
{
	struct toepcb *toep = icc->toep;
	struct adapter *sc = td_adapter(toep->td);
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	cxgbei_ulp2_ddp_tag_release(ci, ddp_tag, icc);

	return (0);
}

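/*
 * One-time setup of the per-adapter DDP state: size the pagepod region,
 * create the DMA tag and the pagepod/gather-list bookkeeping arrays, and
 * program the tag format and page-size order into the hardware.
 */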
static int
cxgbei_ddp_init(struct adapter *sc, struct cxgbei_data *ci)
{
	int nppods, bits, max_sz, rc;
	static const u_int pgsz_order[] = {0, 1, 2, 3};

	MPASS(sc->vres.iscsi.size > 0);

	ci->llimit = sc->vres.iscsi.start;
	ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1;
	max_sz = G_MAXRXDATA(t4_read_reg(sc, A_TP_PARA_REG2));

	nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT;
	if (nppods <= 1024)
		return (ENXIO);

	bits = fls(nppods);
	if (bits > IPPOD_IDX_MAX_SIZE)
		bits = IPPOD_IDX_MAX_SIZE;
	nppods = (1 << (bits - 1)) - 1;

	rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX, 8, BUS_SPACE_MAXSIZE,
	    BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag);
	if (rc != 0) {
		device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n",
		    __func__, rc);
		return (rc);
	}

	ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO);
	ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *),
	    M_CXGBE, M_NOWAIT | M_ZERO);
	if (ci->colors == NULL || ci->gl_map == NULL) {
		bus_dma_tag_destroy(ci->ulp_ddp_tag);
		free(ci->colors, M_CXGBE);
		free(ci->gl_map, M_CXGBE);
		return (ENOMEM);
	}

	mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK);
	ci->max_txsz = ci->max_rxsz = min(max_sz, ULP2_MAX_PKT_SIZE);
	ci->nppods = nppods;
	ci->idx_last = nppods;
	ci->idx_bits = bits;
	ci->idx_mask = (1 << bits) - 1;
	ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1;

	ci->tag_format.sw_bits = bits;
	ci->tag_format.rsvd_bits = bits;
	ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT;
	ci->tag_format.rsvd_mask = ci->idx_mask;

	t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order);

	return (rc);
}

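/*
 * CPL_ISCSI_HDR: the hardware has delivered the BHS of an iSCSI PDU.  Start
 * assembling a new icl_pdu and stash it in the toepcb until the rest of the
 * PDU (data and/or status) arrives.
 */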
static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;

	M_ASSERTPKTHDR(m);

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	icp = ip_to_icp(ip);
	bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct
	    iscsi_bhs));
	icp->pdu_seq = ntohl(cpl->seq);
	icp->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u",
	    __func__, tid, ntohs(cpl->len), m->m_len);
#endif

	m_freem(m);
	return (0);
}

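/*
 * CPL_ISCSI_DATA: PDU payload delivered in the mbuf (i.e. not placed
 * directly).  Attach the data (minus the CPL header) to the PDU started by
 * do_rx_iscsi_hdr().
 */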
static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;

	M_ASSERTPKTHDR(m);

	/* Must already have received the header (but not the data). */
	MPASS(icp != NULL);
	MPASS(icp->pdu_flags == SBUF_ULP_FLAG_HDR_RCVD);
	MPASS(icp->ip.ip_data_mbuf == NULL);
	MPASS(icp->ip.ip_data_len == 0);

	m_adj(m, sizeof(*cpl));

	icp->pdu_flags |= SBUF_ULP_FLAG_DATA_RCVD;
	icp->ip.ip_data_mbuf = m;
	icp->ip.ip_data_len = m->m_pkthdr.len;

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u",
	    __func__, tid, ntohs(cpl->len), m->m_len);
#endif

	return (0);
}

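/*
 * CPL_RX_ISCSI_DDP: the PDU is complete (its payload, if any, was either
 * placed directly or delivered via CPL_ISCSI_DATA).  Record digest/padding
 * errors, advance the TCP receive state, queue the PDU for the connection's
 * worker thread, and wake that thread if it is idle.
 */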
static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->pdu_flags & SBUF_ULP_FLAG_HDR_RCVD);	/* Data is optional. */
	ip = &icp->ip;
	icp->pdu_flags |= SBUF_ULP_FLAG_STATUS_RCVD;
	val = ntohl(cpl->ddpvld);
	if (val & F_DDP_PADDING_ERR)
		icp->pdu_flags |= SBUF_ULP_FLAG_PAD_ERROR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->pdu_flags |= SBUF_ULP_FLAG_HCRC_ERROR;
	if (val & F_DDP_DATACRC_ERR)
		icp->pdu_flags |= SBUF_ULP_FLAG_DCRC_ERROR;
	if (ip->ip_data_mbuf == NULL) {
		/* XXXNP: what should ip->ip_data_len be, and why? */
		icp->pdu_flags |= SBUF_ULP_FLAG_DATA_DDPED;
	}
	pdu_len = ntohs(cpl->len);	/* includes everything. */

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}

	tp = intotcpcb(inp);
	MPASS(icp->pdu_seq == tp->rcv_nxt);
	MPASS(tp->rcv_wnd >= pdu_len);
	tp->rcv_nxt += pdu_len;
	tp->rcv_wnd -= pdu_len;
	tp->t_rcvtime = ticks;

	/* update rx credits */
	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);

		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	MPASS(m == NULL); /* was unused, we'll use it now. */
	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
	if (__predict_false(m != NULL)) {
		int len = m_length(m, NULL);

		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
		    len);

		/* XXXNP: needs to be rewritten. */
		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
		    iscsi_bhs)) {
			struct icl_cxgbei_pdu *icp0;
			struct icl_pdu *ip0;

			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
			if (ip0 == NULL)
				CXGBE_UNIMPLEMENTED("PDU allocation failure");
			icl_cxgbei_new_pdu_set_conn(ip0, ic);
			icp0 = ip_to_icp(ip0);
			icp0->pdu_seq = 0; /* XXX */
			icp0->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD |
			    SBUF_ULP_FLAG_STATUS_RCVD;
			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
		}
		m_freem(m);
	}

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x",
	    __func__, tid, pdu_len, icp->pdu_flags);
#endif

	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

#ifdef INVARIANTS
	toep->ulpcb2 = NULL;
#endif

	return (0);
}

/* initiator */
void
cxgbei_conn_task_reserve_itt(void *conn, void **prv,
				void *scmd, unsigned int *itt)
{
	unsigned int tag;

	tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt);
	if (tag)
		*itt = htonl(tag);
}

/* target */
void
cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv,
				void *scmd, unsigned int *ttt)
{
	unsigned int tag;

	tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt);
	if (tag)
		*ttt = htonl(tag);
}

void
cxgbei_cleanup_task(void *conn, void *ofld_priv)
{
	struct icl_conn *ic = (struct icl_conn *)conn;
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct cxgbei_task_data *tdata = ofld_priv;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	MPASS(tdata != NULL);

	if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag))
		t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag);
	memset(tdata, 0, sizeof(*tdata));
}

static int
cxgbei_activate(struct adapter *sc)
{
	struct cxgbei_data *ci;
	int rc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (uld_active(sc, ULD_ISCSI)) {
		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
		    __func__, sc));
		return (0);
	}

	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
		device_printf(sc->dev,
		    "not iSCSI offload capable, or capability disabled.\n");
		return (ENOSYS);
	}

	/* per-adapter softc for iSCSI */
	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT);
	if (ci == NULL)
		return (ENOMEM);

	rc = cxgbei_ddp_init(sc, ci);
	if (rc != 0) {
		free(ci, M_CXGBE);
		return (rc);
	}

	sc->iscsi_ulp_softc = ci;

	return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{

	ASSERT_SYNCHRONIZED_OP(sc);

	if (sc->iscsi_ulp_softc != NULL) {
		cxgbei_ddp_cleanup(sc->iscsi_ulp_softc);
		free(sc->iscsi_ulp_softc, M_CXGBE);
		sc->iscsi_ulp_softc = NULL;
	}

	return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
		return;

	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
		(void) t4_activate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI))
		(void) t4_deactivate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};

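/*
 * Main loop of a cxgbei worker thread: pull connections off this thread's
 * rx queue, hand their received PDUs to ICL via ic_receive(), and go back
 * to sleep when there is nothing left to do.
 */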
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	cwt->cwt_state = CWT_RUNNING;
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}

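/*
 * Create the pool of worker threads (at most one per CPU, capped at 32).
 * Failure to start the first thread is fatal; failures after that just
 * reduce the pool size.
 */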
static int
start_worker_threads(void)
{
	int i, rc;
	struct cxgbei_worker_thread_softc *cwt;

	worker_thread_count = min(mp_ncpus, 32);
	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
	    M_WAITOK | M_ZERO);

	MPASS(cxgbei_proc == NULL);
	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
		cv_init(&cwt->cwt_cv, "cwt cv");
		TAILQ_INIT(&cwt->rx_head);
		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
		    "cxgbei", "%d", i);
		if (rc != 0) {
			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
			    i + 1, worker_thread_count, rc);
			mtx_destroy(&cwt->cwt_lock);
			cv_destroy(&cwt->cwt_cv);
			bzero(cwt, sizeof(*cwt));
			if (i == 0) {
				free(cwt_softc, M_CXGBE);
				worker_thread_count = 0;

				return (rc);
			}

			/* Not fatal, carry on with fewer threads. */
			worker_thread_count = i;
			rc = 0;
			break;
		}

		/* Wait for thread to start before moving on to the next one. */
		mtx_lock(&cwt->cwt_lock);
		while (cwt->cwt_state == 0)
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		mtx_unlock(&cwt->cwt_lock);
	}

	MPASS(cwt_softc != NULL);
	MPASS(worker_thread_count > 0);
	return (0);
}

static void
stop_worker_threads(void)
{
	int i;
	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];

	MPASS(worker_thread_count >= 0);

	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_lock(&cwt->cwt_lock);
		MPASS(cwt->cwt_state == CWT_RUNNING ||
		    cwt->cwt_state == CWT_SLEEPING);
		cwt->cwt_state = CWT_STOP;
		cv_signal(&cwt->cwt_cv);
		do {
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		} while (cwt->cwt_state != CWT_STOPPED);
		mtx_unlock(&cwt->cwt_lock);
	}
	free(cwt_softc, M_CXGBE);
}

/* Select a worker thread for a connection. */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
	struct adapter *sc = icc->sc;
	struct toepcb *toep = icc->toep;
	u_int i, n;

	n = worker_thread_count / sc->sge.nofldrxq;
	if (n > 0)
		i = toep->vi->pi->port_id * n + arc4random() % n;
	else
		i = arc4random() % worker_thread_count;

	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

	return (i);
}

static int
cxgbei_mod_load(void)
{
	int rc;

	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);

	rc = start_worker_threads();
	if (rc != 0)
		return (rc);

	rc = t4_register_uld(&cxgbei_uld_info);
	if (rc != 0) {
		stop_worker_threads();
		return (rc);
	}

	t4_iterate(cxgbei_activate_all, NULL);

	return (rc);
}

static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	stop_worker_threads();

	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);

	return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}

static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);