1/*-
2 * Copyright (c) 2012 Chelsio Communications, Inc.
3 * All rights reserved.
4 *
5 * Chelsio T5xx iSCSI driver
6 *
7 * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32#include "opt_inet.h"
33#include "opt_inet6.h"
34
35#include <sys/types.h>
36#include <sys/param.h>
37#include <sys/kernel.h>
38#include <sys/ktr.h>
39#include <sys/module.h>
40#include <sys/systm.h>
41
42#ifdef TCP_OFFLOAD
43#include <sys/errno.h>
44#include <sys/gsb_crc32.h>
45#include <sys/kthread.h>
46#include <sys/smp.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/mbuf.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/condvar.h>
53#include <sys/uio.h>
54
55#include <netinet/in.h>
56#include <netinet/in_pcb.h>
57#include <netinet/toecore.h>
58#include <netinet/tcp_var.h>
59#include <netinet/tcp_fsm.h>
60
61#include <cam/scsi/scsi_all.h>
62#include <cam/scsi/scsi_da.h>
63#include <cam/ctl/ctl_io.h>
64#include <cam/ctl/ctl.h>
65#include <cam/ctl/ctl_backend.h>
66#include <cam/ctl/ctl_error.h>
67#include <cam/ctl/ctl_frontend.h>
68#include <cam/ctl/ctl_debug.h>
69#include <cam/ctl/ctl_ha.h>
70#include <cam/ctl/ctl_ioctl.h>
71
72#include <dev/iscsi/icl.h>
73#include <dev/iscsi/iscsi_proto.h>
74#include <dev/iscsi/iscsi_ioctl.h>
75#include <dev/iscsi/iscsi.h>
76#include <cam/ctl/ctl_frontend_iscsi.h>
77
78#include <cam/cam.h>
79#include <cam/cam_ccb.h>
80#include <cam/cam_xpt.h>
81#include <cam/cam_debug.h>
82#include <cam/cam_sim.h>
83#include <cam/cam_xpt_sim.h>
84#include <cam/cam_xpt_periph.h>
85#include <cam/cam_periph.h>
86#include <cam/cam_compat.h>
87#include <cam/scsi/scsi_message.h>
88
89#include "common/common.h"
90#include "common/t4_msg.h"
91#include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
92#include "tom/t4_tom.h"
93#include "cxgbei.h"
94
95static void
96read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
97    uint32_t *max_rx_data_len, struct ppod_region *pr)
98{
99	uint32_t tx_len, rx_len, r, v;
100
101	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
102	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
103
104	r = t4_read_reg(sc, A_TP_PARA_REG2);
105	rx_len = min(rx_len, G_MAXRXDATA(r));
106	tx_len = min(tx_len, G_MAXRXDATA(r));
107
108	r = t4_read_reg(sc, A_TP_PARA_REG7);
109	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
110	rx_len = min(rx_len, v);
111	tx_len = min(tx_len, v);
112
113	/*
114	 * AHS is not supported by the kernel so we'll not account for
115	 * it either in our PDU len -> data segment len conversions.
116	 */
117	rx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
118	    ISCSI_DATA_DIGEST_SIZE;
119	tx_len -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
120	    ISCSI_DATA_DIGEST_SIZE;
121
122	/*
123	 * DDP can place only 4 pages for a single PDU.  A single
124	 * request might use larger pages than the smallest page size,
125	 * but that cannot be guaranteed.  Assume the smallest DDP
126	 * page size for this limit.
127	 */
128	rx_len = min(rx_len, 4 * (1U << pr->pr_page_shift[0]));
129
130	if (chip_id(sc) == CHELSIO_T5) {
131		tx_len = min(tx_len, 15360);
132
133		rx_len = rounddown2(rx_len, 512);
134		tx_len = rounddown2(tx_len, 512);
135	}
136
137	*max_tx_data_len = tx_len;
138	*max_rx_data_len = rx_len;
139}
140
141/*
142 * Initialize the software state of the iSCSI ULP driver.
143 *
144 * ENXIO means firmware didn't set up something that it was supposed to.
145 */
146static int
147cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
148{
149	struct sysctl_oid *oid;
150	struct sysctl_oid_list *children;
151	struct ppod_region *pr;
152	uint32_t r;
153	int rc;
154
155	MPASS(sc->vres.iscsi.size > 0);
156	MPASS(ci != NULL);
157
158	pr = &ci->pr;
159	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
160	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
161	if (rc != 0) {
162		device_printf(sc->dev,
163		    "%s: failed to initialize the iSCSI page pod region: %u.\n",
164		    __func__, rc);
165		return (rc);
166	}
167
168	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);
169
170	sysctl_ctx_init(&ci->ctx);
171	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
172	children = SYSCTL_CHILDREN(oid);
173
174	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
175	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
176	children = SYSCTL_CHILDREN(oid);
177
178	ci->ddp_threshold = 2048;
179	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
180	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
181
182	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
183	    CTLFLAG_RW, &ci->max_rx_data_len, 0,
184	    "Maximum receive data segment length");
185	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
186	    CTLFLAG_RW, &ci->max_tx_data_len, 0,
187	    "Maximum transmit data segment length");
188
189	return (0);
190}
191
192static int
193do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
194{
195	struct adapter *sc = iq->adapter;
196	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
197	u_int tid = GET_TID(cpl);
198	struct toepcb *toep = lookup_tid(sc, tid);
199	struct icl_pdu *ip;
200	struct icl_cxgbei_pdu *icp;
201	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
202	uint16_t len = be16toh(cpl->len);
203
204	M_ASSERTPKTHDR(m);
205	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
206
207	ip = icl_cxgbei_new_pdu(M_NOWAIT);
208	if (ip == NULL)
209		CXGBE_UNIMPLEMENTED("PDU allocation failure");
210	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
211	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
212	icp = ip_to_icp(ip);
213	icp->icp_seq = ntohl(cpl->seq);
214	icp->icp_flags = ICPF_RX_HDR;
215
216	/* This is the start of a new PDU.  There should be no old state. */
217	MPASS(toep->ulpcb2 == NULL);
218	toep->ulpcb2 = icp;
219
220#if 0
221	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
222	    __func__, tid, len, len_ddp, icp);
223#endif
224
225	m_freem(m);
226	return (0);
227}
228
/*
 * Handler for CPL_ISCSI_DATA: the payload of a PDU delivered via the
 * freelist (i.e. not placed directly by DDP).  On T5 this follows the
 * CPL_ISCSI_HDR for the same PDU; on T6 with completion moderation it
 * can be the first CPL seen for a PDU (the header arrives later in
 * CPL_RX_ISCSI_CMP).
 */
static int
do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));

	if (icp == NULL) {
		/*
		 * T6 completion enabled, start of a new pdu. Header
		 * will come in completion CPL.
		 */
	        ip = icl_cxgbei_new_pdu(M_NOWAIT);
	        if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	} else {
		/* T5 mode, header is already received. */
		MPASS(icp->icp_flags == ICPF_RX_HDR);
		MPASS(icp->ip.ip_data_mbuf == NULL);
		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
	}

	/* Trim the cpl header from mbuf. */
	m_adj(m, sizeof(*cpl));

	/* Attach the payload and account for a freelist delivery. */
	icp->icp_flags |= ICPF_RX_FLBUF;
	icp->ip.ip_data_mbuf = m;
	toep->ofld_rxq->rx_iscsi_fl_pdus++;
	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;

	/*
	 * For T6, save the icp for further processing in the
	 * completion handler.  The equality test (rather than a bit
	 * test) is deliberate: only a PDU allocated above has
	 * icp_flags == ICPF_RX_FLBUF exactly; a T5 PDU also carries
	 * ICPF_RX_HDR and is already linked from ulpcb2.
	 */
	if (icp->icp_flags == ICPF_RX_FLBUF) {
		MPASS(toep->ulpcb2 == NULL);
		toep->ulpcb2 = icp;
	}

#if 0
	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
	    be16toh(cpl->len), icp);
#endif

	return (0);
}
282
283static int
284mbuf_crc32c_helper(void *arg, void *data, u_int len)
285{
286	uint32_t *digestp = arg;
287
288	*digestp = calculate_crc32c(*digestp, data, len);
289	return (0);
290}
291
292static struct icl_pdu *
293parse_pdu(struct socket *so, struct toepcb *toep, struct icl_cxgbei_conn *icc,
294    struct sockbuf *sb, u_int total_len)
295{
296	struct uio uio;
297	struct iovec iov[2];
298	struct iscsi_bhs bhs;
299	struct mbuf *m;
300	struct icl_pdu *ip;
301	u_int ahs_len, data_len, header_len, pdu_len;
302	uint32_t calc_digest, wire_digest;
303	int error;
304
305	uio.uio_segflg = UIO_SYSSPACE;
306	uio.uio_rw = UIO_READ;
307	uio.uio_td = curthread;
308
309	header_len = sizeof(struct iscsi_bhs);
310	if (icc->ic.ic_header_crc32c)
311		header_len += ISCSI_HEADER_DIGEST_SIZE;
312
313	if (total_len < header_len) {
314		ICL_WARN("truncated pre-offload PDU with len %u", total_len);
315		return (NULL);
316	}
317
318	iov[0].iov_base = &bhs;
319	iov[0].iov_len = sizeof(bhs);
320	iov[1].iov_base = &wire_digest;
321	iov[1].iov_len = sizeof(wire_digest);
322	uio.uio_iov = iov;
323	uio.uio_iovcnt = 1;
324	uio.uio_offset = 0;
325	uio.uio_resid = header_len;
326	error = soreceive(so, NULL, &uio, NULL, NULL, NULL);
327	if (error != 0) {
328		ICL_WARN("failed to read BHS from pre-offload PDU: %d", error);
329		return (NULL);
330	}
331
332	ahs_len = bhs.bhs_total_ahs_len * 4;
333	data_len = bhs.bhs_data_segment_len[0] << 16 |
334	    bhs.bhs_data_segment_len[1] << 8 |
335	    bhs.bhs_data_segment_len[2];
336	pdu_len = header_len + ahs_len + roundup2(data_len, 4);
337	if (icc->ic.ic_data_crc32c && data_len != 0)
338		pdu_len += ISCSI_DATA_DIGEST_SIZE;
339
340	if (total_len < pdu_len) {
341		ICL_WARN("truncated pre-offload PDU len %u vs %u", total_len,
342		    pdu_len);
343		return (NULL);
344	}
345
346	if (ahs_len != 0) {
347		ICL_WARN("received pre-offload PDU with AHS");
348		return (NULL);
349	}
350
351	if (icc->ic.ic_header_crc32c) {
352		calc_digest = calculate_crc32c(0xffffffff, (caddr_t)&bhs,
353		    sizeof(bhs));
354		calc_digest ^= 0xffffffff;
355		if (calc_digest != wire_digest) {
356			ICL_WARN("received pre-offload PDU 0x%02x with "
357			    "invalid header digest (0x%x vs 0x%x)",
358			    bhs.bhs_opcode, wire_digest, calc_digest);
359			toep->ofld_rxq->rx_iscsi_header_digest_errors++;
360			return (NULL);
361		}
362	}
363
364	m = NULL;
365	if (data_len != 0) {
366		uio.uio_iov = NULL;
367		uio.uio_resid = roundup2(data_len, 4);
368		if (icc->ic.ic_data_crc32c)
369			uio.uio_resid += ISCSI_DATA_DIGEST_SIZE;
370
371		error = soreceive(so, NULL, &uio, &m, NULL, NULL);
372		if (error != 0) {
373			ICL_WARN("failed to read data payload from "
374			    "pre-offload PDU: %d", error);
375			return (NULL);
376		}
377
378		if (icc->ic.ic_data_crc32c) {
379			m_copydata(m, roundup2(data_len, 4),
380			    sizeof(wire_digest), (caddr_t)&wire_digest);
381
382			calc_digest = 0xffffffff;
383			m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper,
384			    &calc_digest);
385			calc_digest ^= 0xffffffff;
386			if (calc_digest != wire_digest) {
387				ICL_WARN("received pre-offload PDU 0x%02x "
388				    "with invalid data digest (0x%x vs 0x%x)",
389				    bhs.bhs_opcode, wire_digest, calc_digest);
390				toep->ofld_rxq->rx_iscsi_data_digest_errors++;
391				m_freem(m);
392				return (NULL);
393			}
394		}
395	}
396
397	ip = icl_cxgbei_new_pdu(M_WAITOK);
398	icl_cxgbei_new_pdu_set_conn(ip, &icc->ic);
399	*ip->ip_bhs = bhs;
400	ip->ip_data_len = data_len;
401	ip->ip_data_mbuf = m;
402	return (ip);
403}
404
/*
 * Convert any bytes waiting in the socket buffer (data that arrived
 * before the connection was handed off to the offload engine) into ICL
 * PDUs on icc->rcvd_pdus.  Called, and returns, with sb locked; the
 * lock is dropped around parse_pdu() because soreceive() may sleep.
 * The parsed batch is inserted at the front of rcvd_pdus, in arrival
 * order, ahead of any PDUs the CPL handlers may queue later.
 */
void
parse_pdus(struct icl_cxgbei_conn *icc, struct sockbuf *sb)
{
	struct icl_conn *ic = &icc->ic;
	struct socket *so = ic->ic_socket;
	struct toepcb *toep = icc->toep;
	struct icl_pdu *ip, *lastip;
	u_int total_len;

	SOCKBUF_LOCK_ASSERT(sb);

	CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid,
	    sbused(sb));

	lastip = NULL;
	while (sbused(sb) != 0 && (sb->sb_state & SBS_CANTRCVMORE) == 0) {
		/* Snapshot the byte count before dropping the lock. */
		total_len = sbused(sb);
		SOCKBUF_UNLOCK(sb);

		ip = parse_pdu(so, toep, icc, sb, total_len);

		if (ip == NULL) {
			/* Parse/digest failure: report it and stop. */
			ic->ic_error(ic);
			SOCKBUF_LOCK(sb);
			return;
		}

		/* First PDU goes to the head; the rest chain after it. */
		if (lastip == NULL)
			STAILQ_INSERT_HEAD(&icc->rcvd_pdus, ip, ip_next);
		else
			STAILQ_INSERT_AFTER(&icc->rcvd_pdus, lastip, ip,
			    ip_next);
		lastip = ip;

		SOCKBUF_LOCK(sb);
	}
}
442
/*
 * Handler for CPL_RX_ISCSI_DDP: delivery status for the PDU currently
 * being assembled in toep->ulpcb2 (the header, and any freelist data,
 * have already been seen).  Completes the PDU, advances rcv_nxt, and
 * queues the PDU on the connection's rcvd_pdus list for the rx thread.
 */
static int
do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	u_int pdu_len, val;
	struct epoch_tracker et;

	MPASS(m == NULL);

	/* Must already be assembling a PDU. */
	MPASS(icp != NULL);
	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);

	pdu_len = be16toh(cpl->len);	/* includes everything. */
	val = be32toh(cpl->ddpvld);

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
	    __func__, tid, pdu_len, val, icp->icp_flags);
#endif

	icp->icp_flags |= ICPF_RX_STATUS;
	ip = &icp->ip;

	/* Log and count any integrity errors flagged by the hardware. */
	if (val & F_DDP_PADDING_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid padding",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_padding_errors++;
	}
	if (val & F_DDP_HDRCRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid header digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
	}
	if (val & F_DDP_DATACRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid data digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
	}
	/* No freelist data means the payload was placed directly by DDP. */
	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/* Connection is already dead; discard the PDU. */
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}

	/*
	 * T6+ does not report data PDUs received via DDP without F
	 * set.  This can result in gaps in the TCP sequence space.
	 */
	tp = intotcpcb(inp);
	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	icc = toep->ulpcb;
	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
		/*
		 * Connection is being torn down or the receive side
		 * was shut down: reset the connection and drop the PDU.
		 */
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		/* tcp_drop() requires the epoch and the inp lock. */
		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		return (0);
	}
	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	/* Digest/padding errors are fatal to the connection. */
	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
	    F_DDP_DATACRC_ERR)) != 0) {
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		ic->ic_error(ic);
		return (0);
	}

	icl_cxgbei_new_pdu_set_conn(ip, ic);

	/* Hand the completed PDU to the connection's rx thread. */
	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if (!icc->rx_active) {
		icc->rx_active = true;
		wakeup(&icc->rx_active);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	/* PDU assembly for this tid is complete. */
	toep->ulpcb2 = NULL;

	return (0);
}
578
/*
 * Handler for CPL_RX_ISCSI_CMP (T6+ completion moderation): final
 * status for a PDU, with the BHS of the (last) PDU embedded in the
 * CPL.  For DDP-placed Data-In/Data-Out the hardware may coalesce a
 * burst of consecutive PDUs into one completion; such a burst is
 * reported to ICL as a single "large" PDU covering the whole range.
 */
static int
do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct epoch_tracker et;
	struct adapter *sc = iq->adapter;
	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
	u_int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
	struct icl_pdu *ip;
	struct cxgbei_cmp *cmp;
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	uint16_t len = be16toh(cpl->len);
	u_int data_digest_len;
#endif
	struct socket *so;
	struct sockbuf *sb;
	struct tcpcb *tp;
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;
	struct iscsi_bhs_data_out *bhsdo;
	u_int val = be32toh(cpl->ddpvld);
	u_int npdus, pdu_len;
	uint32_t prev_seg_len;

	M_ASSERTPKTHDR(m);
	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));

	/*
	 * No DDP bit means the payload (if any) arrived via the
	 * freelist, so do_rx_iscsi_data() must already have started
	 * assembling this PDU in toep->ulpcb2.
	 */
	if ((val & F_DDP_PDU) == 0) {
		MPASS(icp != NULL);
		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
		ip = &icp->ip;
	}

	if (icp == NULL) {
		/* T6 completion enabled, start of a new PDU. */
		ip = icl_cxgbei_new_pdu(M_NOWAIT);
		if (ip == NULL)
			CXGBE_UNIMPLEMENTED("PDU allocation failure");
		icp = ip_to_icp(ip);
	}
	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));

#if 0
	CTR5(KTR_CXGBE,
	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
	    __func__, tid, pdu_len, val, icp);
#endif

	/* Copy header */
	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
	/* DataSegmentLength is a 24-bit big-endian field. */
	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
	    bhsdo->bhsdo_data_segment_len[1] << 8 |
	    bhsdo->bhsdo_data_segment_len[2];
	icp->icp_seq = ntohl(cpl->seq);
	icp->icp_flags |= ICPF_RX_HDR;
	icp->icp_flags |= ICPF_RX_STATUS;

	/* Log and count any integrity errors flagged by the hardware. */
	if (val & F_DDP_PADDING_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid padding",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_padding_errors++;
	}
	if (val & F_DDP_HDRCRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid header digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
	}
	if (val & F_DDP_DATACRC_ERR) {
		ICL_WARN("received PDU 0x%02x with invalid data digest",
		    ip->ip_bhs->bhs_opcode);
		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
	}

	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/* Connection is already dead; discard the PDU. */
		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
		    __func__, tid, pdu_len, inp->inp_flags);
		INP_WUNLOCK(inp);
		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	tp = intotcpcb(inp);

	/*
	 * If icc is NULL, the connection is being closed in
	 * icl_cxgbei_conn_close(), just drop this data.
	 */
	icc = toep->ulpcb;
	if (__predict_false(icc == NULL)) {
		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
		    __func__, tid, pdu_len, icc);

		/*
		 * Update rcv_nxt so the sequence number of the FIN
		 * doesn't appear wrong.
		 */
		tp->rcv_nxt = icp->icp_seq + pdu_len;
		tp->t_rcvtime = ticks;
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ic = &icc->ic;
	/* Digest/padding errors are fatal to the connection. */
	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
	    F_DDP_DATACRC_ERR)) != 0) {
		INP_WUNLOCK(inp);

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		ic->ic_error(ic);
		return (0);
	}

#ifdef INVARIANTS
	/* Sanity: BHS len + padded data + data digest == overall PDU len. */
	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
	    ISCSI_DATA_DIGEST_SIZE : 0;
	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
#endif

	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
		/* Payload was placed directly in the buffer by DDP. */
		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
		MPASS(ip->ip_data_len > 0);
		icp->icp_flags |= ICPF_RX_DDP;
		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;

		/* Look up the transfer this PDU belongs to by its tag. */
		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_initiator_task_tag));
			break;
		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
			cmp = cxgbei_find_cmp(icc,
			    be32toh(bhsdo->bhsdo_target_transfer_tag));
			break;
		default:
			__assert_unreachable();
		}
		MPASS(cmp != NULL);

		/*
		 * The difference between the end of the last burst
		 * and the offset of the last PDU in this burst is
		 * the additional data received via DDP.
		 */
		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
		    cmp->next_buffer_offset;

		if (prev_seg_len != 0) {
			uint32_t orig_datasn;

			/*
			 * Return a "large" PDU representing the burst
			 * of PDUs.  Adjust the offset and length of
			 * this PDU to represent the entire burst.
			 */
			ip->ip_data_len += prev_seg_len;
			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
			bhsdo->bhsdo_buffer_offset =
			    htobe32(cmp->next_buffer_offset);

			/* htobe32() doubles as be32toh() (both byte-swap). */
			orig_datasn = htobe32(bhsdo->bhsdo_datasn);
			npdus = orig_datasn - cmp->last_datasn;
			/* Rewrite DataSN to the first SN of the burst. */
			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
			cmp->last_datasn = orig_datasn;
			ip->ip_additional_pdus = npdus - 1;
		} else {
			/* Single PDU: its DataSN must be consecutive. */
			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
			    cmp->last_datasn + 1);
			npdus = 1;
			cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
		}

		cmp->next_buffer_offset += ip->ip_data_len;
		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
	} else {
		/* Freelist data must already be attached to the PDU. */
		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
	}

	tp->rcv_nxt = icp->icp_seq + pdu_len;
	tp->t_rcvtime = ticks;

	/*
	 * Don't update the window size or return credits since RX
	 * flow control is disabled.
	 */

	so = inp->inp_socket;
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
		/* Receive side was shut down: reset the connection. */
		CTR5(KTR_CXGBE,
		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
		    __func__, tid, pdu_len, icc, sb->sb_state);
		SOCKBUF_UNLOCK(sb);
		INP_WUNLOCK(inp);

		/* tcp_drop() requires the epoch and the inp lock. */
		CURVNET_SET(so->so_vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		tp = tcp_drop(tp, ECONNRESET);
		if (tp != NULL)
			INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		icl_cxgbei_conn_pdu_free(NULL, ip);
		toep->ulpcb2 = NULL;
		m_freem(m);
		return (0);
	}

	icl_cxgbei_new_pdu_set_conn(ip, ic);

	/* Enqueue the PDU to the received pdus queue. */
	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
	if (!icc->rx_active) {
		/* Wake the connection's rx thread if it is idle. */
		icc->rx_active = true;
		wakeup(&icc->rx_active);
	}
	SOCKBUF_UNLOCK(sb);
	INP_WUNLOCK(inp);

	/* PDU assembly for this tid is complete. */
	toep->ulpcb2 = NULL;
	m_freem(m);

	return (0);
}
822
823static int
824cxgbei_activate(struct adapter *sc)
825{
826	struct cxgbei_data *ci;
827	int rc;
828
829	ASSERT_SYNCHRONIZED_OP(sc);
830
831	if (uld_active(sc, ULD_ISCSI)) {
832		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
833		    __func__, sc));
834		return (0);
835	}
836
837	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
838		device_printf(sc->dev,
839		    "not iSCSI offload capable, or capability disabled.\n");
840		return (ENOSYS);
841	}
842
843	/* per-adapter softc for iSCSI */
844	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
845	if (ci == NULL)
846		return (ENOMEM);
847
848	rc = cxgbei_init(sc, ci);
849	if (rc != 0) {
850		free(ci, M_CXGBE);
851		return (rc);
852	}
853
854	sc->iscsi_ulp_softc = ci;
855
856	return (0);
857}
858
859static int
860cxgbei_deactivate(struct adapter *sc)
861{
862	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
863
864	ASSERT_SYNCHRONIZED_OP(sc);
865
866	if (ci != NULL) {
867		sysctl_ctx_free(&ci->ctx);
868		t4_free_ppod_region(&ci->pr);
869		free(ci, M_CXGBE);
870		sc->iscsi_ulp_softc = NULL;
871	}
872
873	return (0);
874}
875
876static void
877cxgbei_activate_all(struct adapter *sc, void *arg __unused)
878{
879
880	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isact") != 0)
881		return;
882
883	/* Activate iSCSI if any port on this adapter has IFCAP_TOE enabled. */
884	if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
885		(void) t4_activate_uld(sc, ULD_ISCSI);
886
887	end_synchronized_op(sc, 0);
888}
889
890static void
891cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
892{
893
894	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4isdea") != 0)
895		return;
896
897	if (uld_active(sc, ULD_ISCSI))
898	    (void) t4_deactivate_uld(sc, ULD_ISCSI);
899
900	end_synchronized_op(sc, 0);
901}
902
/* Registration record tying this ULP's activate/deactivate hooks
 * into the base cxgbe(4) driver's ULD framework. */
static struct uld_info cxgbei_uld_info = {
	.uld_id = ULD_ISCSI,
	.activate = cxgbei_activate,
	.deactivate = cxgbei_deactivate,
};
908
909static int
910cxgbei_mod_load(void)
911{
912	int rc;
913
914	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
915	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
916	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
917	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
918
919	rc = t4_register_uld(&cxgbei_uld_info);
920	if (rc != 0)
921		return (rc);
922
923	t4_iterate(cxgbei_activate_all, NULL);
924
925	return (rc);
926}
927
928static int
929cxgbei_mod_unload(void)
930{
931
932	t4_iterate(cxgbei_deactivate_all, NULL);
933
934	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
935		return (EBUSY);
936
937	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
938	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
939	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
940	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
941
942	return (0);
943}
944#endif
945
946static int
947cxgbei_modevent(module_t mod, int cmd, void *arg)
948{
949	int rc = 0;
950
951#ifdef TCP_OFFLOAD
952	switch (cmd) {
953	case MOD_LOAD:
954		rc = cxgbei_mod_load();
955		if (rc == 0)
956			rc = icl_cxgbei_mod_load();
957		break;
958
959	case MOD_UNLOAD:
960		rc = icl_cxgbei_mod_unload();
961		if (rc == 0)
962			rc = cxgbei_mod_unload();
963		break;
964
965	default:
966		rc = EINVAL;
967	}
968#else
969	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
970	rc = EOPNOTSUPP;
971#endif
972
973	return (rc);
974}
975
/* Module descriptor: name, event handler, no extra argument. */
static moduledata_t cxgbei_mod = {
	"cxgbei",
	cxgbei_modevent,
	NULL,
};

/* Module registration and load-order dependencies on the TOE driver
 * (t4_tom), the base NIC driver (cxgbe), and the iSCSI Common Layer. */
MODULE_VERSION(cxgbei, 1);
DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
987