/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Chelsio T5xx iSCSI driver
 *
 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/cxgbe/cxgbei/cxgbei.c 309555 2016-12-05 19:15:33Z jhb $");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/systm.h>

#ifdef TCP_OFFLOAD
#include <sys/errno.h>
#include <sys/kthread.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/toecore.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <dev/iscsi/iscsi_ioctl.h>
#include <dev/iscsi/iscsi.h>
#include <cam/ctl/ctl_frontend_iscsi.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"     /* for PCIE_MEM_ACCESS */
#include "tom/t4_tom.h"
#include "cxgbei.h"
#include "cxgbei_ulp2_ddp.h"

static int worker_thread_count;
static struct cxgbei_worker_thread_softc *cwt_softc;
static struct proc *cxgbei_proc;

/* XXXNP some header instead. */
struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);

/*
 * Direct Data Placement -
 * Directly place an iSCSI Data-In or Data-Out PDU's payload into the
 * pre-posted, final destination host-memory buffers, based on the Initiator
 * Task Tag (ITT) in Data-In PDUs or the Target Task Tag (TTT) in Data-Out
 * PDUs.  The host memory addresses are programmed into the hardware as
 * pagepod entries.  The location of a pagepod entry is encoded into the DDP
 * tag, which is used as the base for the ITT/TTT.
 */
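
/*
 * A rough sketch of the tag encoding, as implied by the tag_format fields
 * initialized in cxgbei_ddp_init() below (the authoritative encode/decode
 * helpers live in the cxgbei_ulp2 DDP code): a DDP-capable tag carries the
 * pagepod index in a field of tag_format.rsvd_bits bits starting at bit
 * tag_format.rsvd_shift (IPPOD_IDX_SHIFT).  cxgbei_ulp2_sw_tag_usable()
 * checks whether a given ITT/TTT leaves that field available, and
 * cxgbei_ulp2_set_non_ddp_tag() stamps tags that must bypass DDP.
 */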

/*
 * functions to program the pagepod in h/w
 */
static inline void
ppod_set(struct pagepod *ppod,
	struct cxgbei_ulp2_pagepod_hdr *hdr,
	struct cxgbei_ulp2_gather_list *gl,
	unsigned int pidx)
{
	int i;

	memcpy(ppod, hdr, sizeof(*hdr));

	for (i = 0; i < (PPOD_PAGES + 1); i++, pidx++) {
		ppod->addr[i] = pidx < gl->nelem ?
			cpu_to_be64(gl->dma_sg[pidx].phys_addr) : 0ULL;
	}
}

static inline void
ppod_clear(struct pagepod *ppod)
{
	memset(ppod, 0, sizeof(*ppod));
}

static inline void
ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req,
		unsigned int wr_len, unsigned int dlen,
		unsigned int pm_addr)
{
	struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1);

	INIT_ULPTX_WR(req, wr_len, 0, 0);
	req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
				V_ULP_MEMIO_ORDER(is_t4(sc)) |
				V_T5_ULP_MEMIO_IMM(is_t5(sc)));
	req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5));
	req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)
				| V_FW_WR_FLOWID(tid));
	req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5));

	idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM));
	idata->len = htonl(dlen);
}

#define ULPMEM_IDATA_MAX_NPPODS 1	/* 256/PPOD_SIZE */
#define PCIE_MEMWIN_MAX_NPPODS 16	/* 1024/PPOD_SIZE */

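/*
 * Write "npods" pagepods starting at pagepod index "idx" using a single
 * ULP_TX_MEM_WRITE work request that carries the pagepods as immediate data;
 * if "hdr" is NULL the pagepods are cleared instead.  The request is queued
 * on the connection's control queue.
 */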
static int
ppod_write_idata(struct cxgbei_data *ci,
			struct cxgbei_ulp2_pagepod_hdr *hdr,
			unsigned int idx, unsigned int npods,
			struct cxgbei_ulp2_gather_list *gl,
			unsigned int gl_pidx, struct toepcb *toep)
{
	u_int dlen = PPOD_SIZE * npods;
	u_int pm_addr = idx * PPOD_SIZE + ci->llimit;
	u_int wr_len = roundup(sizeof(struct ulp_mem_io) +
	    sizeof(struct ulptx_idata) + dlen, 16);
	struct ulp_mem_io *req;
	struct ulptx_idata *idata;
	struct pagepod *ppod;
	u_int i;
	struct wrqe *wr;
	struct adapter *sc = toep->vi->pi->adapter;

	wr = alloc_wrqe(wr_len, toep->ctrlq);
	if (wr == NULL) {
		CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure");
		return (ENOMEM);
	}

	req = wrtod(wr);
	memset(req, 0, wr_len);
	ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr);
	idata = (struct ulptx_idata *)(req + 1);

	ppod = (struct pagepod *)(idata + 1);
	for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) {
		if (!hdr) /* clear the pagepod */
			ppod_clear(ppod);
		else /* set the pagepod */
			ppod_set(ppod, hdr, gl, gl_pidx);
	}

	t4_wrq_tx(sc, wr);
	return (0);
}

int
t4_ddp_set_map(struct cxgbei_data *ci, void *iccp,
    struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods,
    struct cxgbei_ulp2_gather_list *gl, int reply)
{
	struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp;
	struct toepcb *toep = icc->toep;
	int err;
	unsigned int pidx = 0, w_npods = 0, cnt;

	/*
	 * on T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE,
	 * the order would not be guaranteed, so we will stick with IMMD
	 */
	gl->tid = toep->tid;
	gl->port_id = toep->vi->pi->port_id;
	gl->egress_dev = (void *)toep->vi->ifp;

	/* send via immediate data */
	for (; w_npods < npods; idx += cnt, w_npods += cnt,
		pidx += PPOD_PAGES) {
		cnt = npods - w_npods;
		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
			cnt = ULPMEM_IDATA_MAX_NPPODS;
		err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep);
		if (err) {
			printf("%s: ppod_write_idata failed\n", __func__);
			break;
		}
	}
	return (err);
}

void
t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl,
    u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc)
{
	struct toepcb *toep = icc->toep;
	int err = -1;
	u_int pidx = 0;
	u_int w_npods = 0;
	u_int cnt;

	for (; w_npods < npods; idx += cnt, w_npods += cnt,
		pidx += PPOD_PAGES) {
		cnt = npods - w_npods;
		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
			cnt = ULPMEM_IDATA_MAX_NPPODS;
		err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep);
		if (err)
			break;
	}
}

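/*
 * Describe the CCB's data buffer as a cxgbei_sgl array, one entry per page,
 * and return the number of entries.  Only the first entry records the
 * buffer's offset within its page; the rest are page-sized except possibly
 * the last.
 */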
static int
cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio)
{
	unsigned int data_len = csio->dxfer_len;
	unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK;
	unsigned int nsge;
	unsigned char *sgaddr = csio->data_ptr;
	unsigned int len = 0;

	nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	sgl->sg_addr = sgaddr;
	sgl->sg_offset = sgoffset;
	if (data_len < (PAGE_SIZE - sgoffset))
		len = data_len;
	else
		len = PAGE_SIZE - sgoffset;

	sgl->sg_length = len;

	data_len -= len;
	sgaddr += len;
	sgl = sgl + 1;

	while (data_len > 0) {
		sgl->sg_addr = sgaddr;
		len = (data_len < PAGE_SIZE) ? data_len : PAGE_SIZE;
		sgl->sg_length = len;
		sgaddr += len;
		data_len -= len;
		sgl = sgl + 1;
	}

	return (nsge);
}

static int
cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io)
{
	unsigned int data_len, sgoffset, nsge;
	unsigned char *sgaddr;
	unsigned int len = 0, index = 0, ctl_sg_count, i;
	struct ctl_sg_entry ctl_sg_entry, *ctl_sglist;

	if (io->scsiio.kern_sg_entries > 0) {
		ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr;
		ctl_sg_count = io->scsiio.kern_sg_entries;
	} else {
		ctl_sglist = &ctl_sg_entry;
		ctl_sglist->addr = io->scsiio.kern_data_ptr;
		ctl_sglist->len = io->scsiio.kern_data_len;
		ctl_sg_count = 1;
	}

	sgaddr = sgl->sg_addr = ctl_sglist[index].addr;
	sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK;
	data_len = ctl_sglist[index].len;

	if (data_len < (PAGE_SIZE - sgoffset))
		len = data_len;
	else
		len = PAGE_SIZE - sgoffset;

	sgl->sg_length = len;

	data_len -= len;
	sgaddr += len;
	sgl = sgl + 1;

	len = 0;
	for (i = 0; i < ctl_sg_count; i++)
		len += ctl_sglist[i].len;
	nsge = (len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	while (data_len > 0) {
		sgl->sg_addr = sgaddr;
		len = (data_len < PAGE_SIZE) ? data_len : PAGE_SIZE;
		sgl->sg_length = len;
		sgaddr += len;
		data_len -= len;
		sgl = sgl + 1;
		if (data_len == 0) {
			if (index == ctl_sg_count - 1)
				break;
			index++;
			sgaddr = ctl_sglist[index].addr;
			data_len = ctl_sglist[index].len;
		}
	}

	return (nsge);
}

static int
t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc,
    u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag)
{
	struct cxgbei_ulp2_gather_list *gl;
	int err = -EINVAL;
	struct toepcb *toep = icc->toep;

	gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0);
	if (gl) {
		err = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid,
		    &ci->tag_format, ddp_tag, gl, 0, 0);
		if (err) {
			cxgbei_ulp2_ddp_release_gl(ci, gl);
		}
	}

	return (err);
}

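/*
 * Initiator side: attempt to reserve a DDP tag for a data-in command.  On
 * success the returned tag encodes the reserved pagepod index and the caller
 * substitutes it for the ITT; on failure the ITT is stamped as a non-DDP tag
 * so the payload takes the normal (non-placed) receive path.
 */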
static unsigned int
cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv,
			struct ccb_scsiio *scmd, unsigned int *itt)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	int xferlen = scmd->dxfer_len;
	struct cxgbei_task_data *tdata = NULL;
	struct cxgbei_sgl *sge = NULL;
	struct toepcb *toep = icc->toep;
	struct adapter *sc = td_adapter(toep->td);
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	int err = -1;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	tdata = (struct cxgbei_task_data *)*prv;
	if (xferlen == 0 || tdata == NULL)
		goto out;
	if (xferlen < DDP_THRESHOLD)
		goto out;

	if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
		tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd);
		if (tdata->nsge == 0) {
			CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
			return (0);
		}
		sge = tdata->sgl;

		tdata->sc_ddp_tag = *itt;

		CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x",
				__func__, *itt, tdata->sc_ddp_tag);
		if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format,
							tdata->sc_ddp_tag)) {
			err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len,
			    sge, tdata->nsge, &tdata->sc_ddp_tag);
		} else {
			CTR3(KTR_CXGBE,
				"%s: itt:0x%x sc_ddp_tag:0x%x not usable",
				__func__, *itt, tdata->sc_ddp_tag);
		}
	}
out:
	if (tdata == NULL)
		return (0);
	if (err < 0)
		tdata->sc_ddp_tag =
			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt);

	return (tdata->sc_ddp_tag);
}

static unsigned int
cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io,
				unsigned int *ttt)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct adapter *sc = td_adapter(toep->td);
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct cxgbei_task_data *tdata = NULL;
	int xferlen, err = -1;
	struct cxgbei_sgl *sge = NULL;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled);
	tdata = (struct cxgbei_task_data *)*prv;
	if ((xferlen == 0) || (tdata == NULL))
		goto out;
	if (xferlen < DDP_THRESHOLD)
		goto out;
	tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io);
	if (tdata->nsge == 0) {
		CTR1(KTR_CXGBE, "%s: map_sg failed", __func__);
		return (0);
	}
	sge = tdata->sgl;

	tdata->sc_ddp_tag = *ttt;
	if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) {
		err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge,
		    tdata->nsge, &tdata->sc_ddp_tag);
	} else {
		CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable",
				__func__, tdata->sc_ddp_tag);
	}
out:
	if (tdata == NULL)
		return (0);
	if (err < 0)
		tdata->sc_ddp_tag =
			cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt);
	return (tdata->sc_ddp_tag);
}

static int
t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag)
{
	struct toepcb *toep = icc->toep;
	struct adapter *sc = td_adapter(toep->td);
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	cxgbei_ulp2_ddp_tag_release(ci, ddp_tag, icc);

	return (0);
}

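/*
 * Carve the adapter's iSCSI pagepod region into pagepods, allocate the
 * per-pagepod bookkeeping (gather-list map and colors), and describe the
 * resulting tag format to the hardware via t4_iscsi_init().
 */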
static int
cxgbei_ddp_init(struct adapter *sc, struct cxgbei_data *ci)
{
	int nppods, bits, max_sz, rc;
	static const u_int pgsz_order[] = {0, 1, 2, 3};

	MPASS(sc->vres.iscsi.size > 0);

	ci->llimit = sc->vres.iscsi.start;
	ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1;
	max_sz = G_MAXRXDATA(t4_read_reg(sc, A_TP_PARA_REG2));

	nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT;
	if (nppods <= 1024)
		return (ENXIO);

	bits = fls(nppods);
	if (bits > IPPOD_IDX_MAX_SIZE)
		bits = IPPOD_IDX_MAX_SIZE;
	nppods = (1 << (bits - 1)) - 1;

	rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX, 8, BUS_SPACE_MAXSIZE,
	    BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag);
	if (rc != 0) {
		device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n",
		    __func__, rc);
		return (rc);
	}

	ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO);
	ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *),
	    M_CXGBE, M_NOWAIT | M_ZERO);
	if (ci->colors == NULL || ci->gl_map == NULL) {
		bus_dma_tag_destroy(ci->ulp_ddp_tag);
		free(ci->colors, M_CXGBE);
		free(ci->gl_map, M_CXGBE);
		return (ENOMEM);
	}

	mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK);
	ci->max_txsz = ci->max_rxsz = min(max_sz, ULP2_MAX_PKT_SIZE);
	ci->nppods = nppods;
	ci->idx_last = nppods;
	ci->idx_bits = bits;
	ci->idx_mask = (1 << bits) - 1;
	ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1;

	ci->tag_format.sw_bits = bits;
	ci->tag_format.rsvd_bits = bits;
	ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT;
	ci->tag_format.rsvd_mask = ci->idx_mask;

	t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order);

	return (rc);
}

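/*
 * Receive path: each iSCSI PDU arrives as up to three CPLs.  CPL_ISCSI_HDR
 * carries the BHS, CPL_ISCSI_DATA carries any payload that was not placed
 * directly, and CPL_RX_ISCSI_DDP completes the PDU with the DDP/CRC status,
 * after which the PDU is queued for a worker thread to deliver to ICL.
 */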
static int
do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_pdu *ip;
	struct icl_cxgbei_pdu *icp;

	M_ASSERTPKTHDR(m);

	ip = icl_cxgbei_new_pdu(M_NOWAIT);
	if (ip == NULL)
		CXGBE_UNIMPLEMENTED("PDU allocation failure");
	icp = ip_to_icp(ip);
	bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct
	    iscsi_bhs));
	icp->pdu_seq = ntohl(cpl->seq);
	icp->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD;

	/* This is the start of a new PDU.  There should be no old state. */
	MPASS(toep->ulpcb2 == NULL);
	toep->ulpcb2 = icp;

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u",
	    __func__, tid, ntohs(cpl->len), m->m_len);
#endif

	m_freem(m);
	return (0);
}

static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;

	M_ASSERTPKTHDR(m);

	/* Must already have received the header (but not the data). */
	MPASS(icp != NULL);
	MPASS(icp->pdu_flags == SBUF_ULP_FLAG_HDR_RCVD);
	MPASS(icp->ip.ip_data_mbuf == NULL);
	MPASS(icp->ip.ip_data_len == 0);

	m_adj(m, sizeof(*cpl));

	icp->pdu_flags |= SBUF_ULP_FLAG_DATA_RCVD;
	icp->ip.ip_data_mbuf = m;
	icp->ip.ip_data_len = m->m_pkthdr.len;

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u",
	    __func__, tid, ntohs(cpl->len), m->m_len);
#endif

	return (0);
}

static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->pdu_flags & SBUF_ULP_FLAG_HDR_RCVD);	/* Data is optional. */
	ip = &icp->ip;
	icp->pdu_flags |= SBUF_ULP_FLAG_STATUS_RCVD;
	val = ntohl(cpl->ddpvld);
	if (val & F_DDP_PADDING_ERR)
		icp->pdu_flags |= SBUF_ULP_FLAG_PAD_ERROR;
	if (val & F_DDP_HDRCRC_ERR)
		icp->pdu_flags |= SBUF_ULP_FLAG_HCRC_ERROR;
	if (val & F_DDP_DATACRC_ERR)
		icp->pdu_flags |= SBUF_ULP_FLAG_DCRC_ERROR;
	if (ip->ip_data_mbuf == NULL) {
		/* XXXNP: what should ip->ip_data_len be, and why? */
		icp->pdu_flags |= SBUF_ULP_FLAG_DATA_DDPED;
	}
	pdu_len = ntohs(cpl->len);	/* includes everything. */

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}

	tp = intotcpcb(inp);
	MPASS(icp->pdu_seq == tp->rcv_nxt);
	MPASS(tp->rcv_wnd >= pdu_len);
	tp->rcv_nxt += pdu_len;
	tp->rcv_wnd -= pdu_len;
	tp->t_rcvtime = ticks;

	/* update rx credits */
	toep->rx_credits += pdu_len;
	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		INP_INFO_RLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);

		icl_cxgbei_conn_pdu_free(NULL, ip);
#ifdef INVARIANTS
		toep->ulpcb2 = NULL;
#endif
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	MPASS(m == NULL); /* was unused, we'll use it now. */
	m = sbcut_locked(sb, sbused(sb)); /* XXXNP: toep->sb_cc accounting? */
	if (__predict_false(m != NULL)) {
		int len = m_length(m, NULL);

		/*
		 * PDUs were received before the tid transitioned to ULP mode.
		 * Convert them to icl_cxgbei_pdus and send them to ICL before
		 * the PDU in icp/ip.
		 */
		CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, tid,
		    len);

		/* XXXNP: needs to be rewritten. */
		if (len == sizeof(struct iscsi_bhs) || len == 4 + sizeof(struct
		    iscsi_bhs)) {
			struct icl_cxgbei_pdu *icp0;
			struct icl_pdu *ip0;

			ip0 = icl_cxgbei_new_pdu(M_NOWAIT);
			if (ip0 == NULL)
				CXGBE_UNIMPLEMENTED("PDU allocation failure");
			icl_cxgbei_new_pdu_set_conn(ip0, ic);
			icp0 = ip_to_icp(ip0);
			icp0->pdu_seq = 0; /* XXX */
			icp0->pdu_flags = SBUF_ULP_FLAG_HDR_RCVD |
			    SBUF_ULP_FLAG_STATUS_RCVD;
			m_copydata(m, 0, sizeof(struct iscsi_bhs), (void *)ip0->ip_bhs);
			STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip0, ip_next);
		}
		m_freem(m);
	}

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x",
	    __func__, tid, pdu_len, icp->pdu_flags);
#endif

	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];

		mtx_lock(&cwt->cwt_lock);
		icc->rx_flags |= RXF_ACTIVE;
		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
		if (cwt->cwt_state == CWT_SLEEPING) {
			cwt->cwt_state = CWT_RUNNING;
			cv_signal(&cwt->cwt_cv);
		}
		mtx_unlock(&cwt->cwt_lock);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

#ifdef INVARIANTS
	toep->ulpcb2 = NULL;
#endif

	return (0);
}

/* initiator */
void
cxgbei_conn_task_reserve_itt(void *conn, void **prv,
				void *scmd, unsigned int *itt)
{
	unsigned int tag;

	tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt);
	if (tag)
		*itt = htonl(tag);
	return;
}

/* target */
void
cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv,
				void *scmd, unsigned int *ttt)
{
	unsigned int tag;

	tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt);
	if (tag)
		*ttt = htonl(tag);
	return;
}

void
cxgbei_cleanup_task(void *conn, void *ofld_priv)
{
	struct icl_conn *ic = (struct icl_conn *)conn;
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct cxgbei_task_data *tdata = ofld_priv;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	MPASS(tdata != NULL);

	if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag))
		t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag);
	memset(tdata, 0, sizeof(*tdata));
}

static int
cxgbei_activate(struct adapter *sc)
{
	struct cxgbei_data *ci;
	int rc;

	ASSERT_SYNCHRONIZED_OP(sc);

	if (uld_active(sc, ULD_ISCSI)) {
		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
		    __func__, sc));
		return (0);
	}

	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
		device_printf(sc->dev,
		    "not iSCSI offload capable, or capability disabled.\n");
		return (ENOSYS);
	}

	/* per-adapter softc for iSCSI */
	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT);
	if (ci == NULL)
		return (ENOMEM);

	rc = cxgbei_ddp_init(sc, ci);
	if (rc != 0) {
		free(ci, M_CXGBE);
		return (rc);
	}

	sc->iscsi_ulp_softc = ci;

	return (0);
}

static int
cxgbei_deactivate(struct adapter *sc)
{

	ASSERT_SYNCHRONIZED_OP(sc);

	if (sc->iscsi_ulp_softc != NULL) {
		cxgbei_ddp_cleanup(sc->iscsi_ulp_softc);
		free(sc->iscsi_ulp_softc, M_CXGBE);
		sc->iscsi_ulp_softc = NULL;
	}

	return (0);
}

static void
cxgbei_activate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
		return;

	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
		(void) t4_activate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static void
cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
{

	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI))
		(void) t4_deactivate_uld(sc, ULD_ISCSI);

	end_synchronized_op(sc, 0);
}

static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};

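/*
 * Worker thread main loop.  Sleep until a connection with received PDUs is
 * queued on this thread's rx_head list, then move that connection's
 * rcvd_pdus onto a local list and hand the PDUs to ICL via ic_receive()
 * with the socket buffer unlocked.
 */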
static void
cwt_main(void *arg)
{
	struct cxgbei_worker_thread_softc *cwt = arg;
	struct icl_cxgbei_conn *icc = NULL;
	struct icl_conn *ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);

	MPASS(cwt != NULL);

	mtx_lock(&cwt->cwt_lock);
	MPASS(cwt->cwt_state == 0);
	cwt->cwt_state = CWT_RUNNING;
	cv_signal(&cwt->cwt_cv);

	while (__predict_true(cwt->cwt_state != CWT_STOP)) {
		cwt->cwt_state = CWT_RUNNING;
		while ((icc = TAILQ_FIRST(&cwt->rx_head)) != NULL) {
			TAILQ_REMOVE(&cwt->rx_head, icc, rx_link);
			mtx_unlock(&cwt->cwt_lock);

			ic = &icc->ic;
			sb = &ic->ic_socket->so_rcv;

			SOCKBUF_LOCK(sb);
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (__predict_true(!(sb->sb_state & SBS_CANTRCVMORE))) {
				MPASS(STAILQ_EMPTY(&rx_pdus));
				STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
				SOCKBUF_UNLOCK(sb);

				/* Hand over PDUs to ICL. */
				while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
					STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
					ic->ic_receive(ip);
				}

				SOCKBUF_LOCK(sb);
				MPASS(STAILQ_EMPTY(&rx_pdus));
			}
			MPASS(icc->rx_flags & RXF_ACTIVE);
			if (STAILQ_EMPTY(&icc->rcvd_pdus) ||
			    __predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
				icc->rx_flags &= ~RXF_ACTIVE;
			} else {
				/*
				 * More PDUs were received while we were busy
				 * handing over the previous batch to ICL.
				 * Re-add this connection to the end of the
				 * queue.
				 */
				mtx_lock(&cwt->cwt_lock);
				TAILQ_INSERT_TAIL(&cwt->rx_head, icc,
				    rx_link);
				mtx_unlock(&cwt->cwt_lock);
			}
			SOCKBUF_UNLOCK(sb);

			mtx_lock(&cwt->cwt_lock);
		}

		/* Inner loop doesn't check for CWT_STOP, do that first. */
		if (__predict_false(cwt->cwt_state == CWT_STOP))
			break;
		cwt->cwt_state = CWT_SLEEPING;
		cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
	}

	MPASS(TAILQ_FIRST(&cwt->rx_head) == NULL);
	mtx_assert(&cwt->cwt_lock, MA_OWNED);
	cwt->cwt_state = CWT_STOPPED;
	cv_signal(&cwt->cwt_cv);
	mtx_unlock(&cwt->cwt_lock);
	kthread_exit();
}

static int
start_worker_threads(void)
{
	int i, rc;
	struct cxgbei_worker_thread_softc *cwt;

	worker_thread_count = min(mp_ncpus, 32);
	cwt_softc = malloc(worker_thread_count * sizeof(*cwt), M_CXGBE,
	    M_WAITOK | M_ZERO);

	MPASS(cxgbei_proc == NULL);
	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_init(&cwt->cwt_lock, "cwt lock", NULL, MTX_DEF);
		cv_init(&cwt->cwt_cv, "cwt cv");
		TAILQ_INIT(&cwt->rx_head);
		rc = kproc_kthread_add(cwt_main, cwt, &cxgbei_proc, NULL, 0, 0,
		    "cxgbei", "%d", i);
		if (rc != 0) {
			printf("cxgbei: failed to start thread #%d/%d (%d)\n",
			    i + 1, worker_thread_count, rc);
			mtx_destroy(&cwt->cwt_lock);
			cv_destroy(&cwt->cwt_cv);
			bzero(cwt, sizeof(*cwt));
			if (i == 0) {
				free(cwt_softc, M_CXGBE);
				worker_thread_count = 0;

				return (rc);
			}

			/* Not fatal, carry on with fewer threads. */
			worker_thread_count = i;
			rc = 0;
			break;
		}

		/* Wait for thread to start before moving on to the next one. */
		mtx_lock(&cwt->cwt_lock);
		while (cwt->cwt_state == 0)
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		mtx_unlock(&cwt->cwt_lock);
	}

	MPASS(cwt_softc != NULL);
	MPASS(worker_thread_count > 0);
	return (0);
}

static void
stop_worker_threads(void)
{
	int i;
	struct cxgbei_worker_thread_softc *cwt = &cwt_softc[0];

	MPASS(worker_thread_count >= 0);

	for (i = 0, cwt = &cwt_softc[0]; i < worker_thread_count; i++, cwt++) {
		mtx_lock(&cwt->cwt_lock);
		MPASS(cwt->cwt_state == CWT_RUNNING ||
		    cwt->cwt_state == CWT_SLEEPING);
		cwt->cwt_state = CWT_STOP;
		cv_signal(&cwt->cwt_cv);
		do {
			cv_wait(&cwt->cwt_cv, &cwt->cwt_lock);
		} while (cwt->cwt_state != CWT_STOPPED);
		mtx_unlock(&cwt->cwt_lock);
	}
	free(cwt_softc, M_CXGBE);
}

/* Select a worker thread for a connection. */
u_int
cxgbei_select_worker_thread(struct icl_cxgbei_conn *icc)
{
	struct adapter *sc = icc->sc;
	struct toepcb *toep = icc->toep;
	u_int i, n;

	n = worker_thread_count / sc->sge.nofldrxq;
	if (n > 0)
		i = toep->vi->pi->port_id * n + arc4random() % n;
	else
		i = arc4random() % worker_thread_count;

	CTR3(KTR_CXGBE, "%s: tid %u, cwt %u", __func__, toep->tid, i);

	return (i);
}

static int
cxgbei_mod_load(void)
{
	int rc;

	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);

	rc = start_worker_threads();
	if (rc != 0)
		return (rc);

	rc = t4_register_uld(&cxgbei_uld_info);
	if (rc != 0) {
		stop_worker_threads();
		return (rc);
	}

	t4_iterate(cxgbei_activate_all, NULL);

	return (rc);
}

static int
cxgbei_mod_unload(void)
{

	t4_iterate(cxgbei_deactivate_all, NULL);

	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
		return (EBUSY);

	stop_worker_threads();

	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);

	return (0);
}
#endif

static int
cxgbei_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = cxgbei_mod_load();
		break;

	case MOD_UNLOAD:
		rc = cxgbei_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif

	return (rc);
}

static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);