/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/endian.h>
#include <sys/gsb_crc32.h>
#include <sys/queue.h>
#include <sys/uio.h>
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "libnvmf.h"
#include "internal.h"
#include "nvmf_tcp.h"

struct nvmf_tcp_qpair;

struct nvmf_tcp_command_buffer {
	struct nvmf_tcp_qpair *qp;

	void	*data;
	size_t	data_len;
	size_t	data_xfered;
	uint32_t data_offset;

	uint16_t cid;
	uint16_t ttag;

	LIST_ENTRY(nvmf_tcp_command_buffer) link;
};

LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer);

struct nvmf_tcp_association {
	struct nvmf_association na;

	uint32_t ioccsz;
};

struct nvmf_tcp_rxpdu {
	struct nvme_tcp_common_pdu_hdr *hdr;
	uint32_t data_len;
};

struct nvmf_tcp_capsule {
	struct nvmf_capsule nc;

	struct nvmf_tcp_rxpdu rx_pdu;
	struct nvmf_tcp_command_buffer *cb;

	TAILQ_ENTRY(nvmf_tcp_capsule) link;
};

struct nvmf_tcp_qpair {
	struct nvmf_qpair qp;
	int s;

	uint8_t	txpda;
	uint8_t rxpda;
	bool header_digests;
	bool data_digests;
	uint32_t maxr2t;
	uint32_t maxh2cdata;
	uint32_t max_icd;	/* Host only */
	uint16_t next_ttag;	/* Controller only */

	struct nvmf_tcp_command_buffer_list tx_buffers;
	struct nvmf_tcp_command_buffer_list rx_buffers;
	TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules;
};

#define	TASSOC(na)	((struct nvmf_tcp_association *)(na))
#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
#define	CTCAP(nc)	((const struct nvmf_tcp_capsule *)(nc))
#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))

static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET];

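/*
 * Header and data digests use the CRC-32C algorithm required by the
 * NVMe/TCP transport, with the final value bit-inverted.
 */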
static uint32_t
compute_digest(const void *buf, size_t len)
{
	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
}

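/*
 * Command buffers track in-progress data transfers for a command and
 * are looked up by CID (and by transfer tag for H2C transfers).
 */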
static struct nvmf_tcp_command_buffer *
tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data,
    uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer *cb;

	cb = malloc(sizeof(*cb));
	cb->qp = qp;
	cb->data = data;
	cb->data_offset = data_offset;
	cb->data_len = data_len;
	cb->data_xfered = 0;
	cb->cid = cid;
	cb->ttag = ttag;

	if (receive)
		LIST_INSERT_HEAD(&qp->rx_buffers, cb, link);
	else
		LIST_INSERT_HEAD(&qp->tx_buffers, cb, link);
	return (cb);
}

static struct nvmf_tcp_command_buffer *
tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer_list *list;
	struct nvmf_tcp_command_buffer *cb;

	list = receive ? &qp->rx_buffers : &qp->tx_buffers;
	LIST_FOREACH(cb, list, link) {
		if (cb->cid == cid && cb->ttag == ttag)
			return (cb);
	}
	return (NULL);
}

static void
tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    bool receive)
{
	struct nvmf_tcp_command_buffer *cb;

	cb = tcp_find_command_buffer(qp, cid, ttag, receive);
	if (cb != NULL)
		LIST_REMOVE(cb, link);
}

static void
tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
{
	LIST_REMOVE(cb, link);
	free(cb);
}

static int
nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len)
{
	ssize_t nwritten;
	const char *cp;

	cp = pdu;
	while (len != 0) {
		nwritten = write(qp->s, cp, len);
		if (nwritten < 0)
			return (errno);
		len -= nwritten;
		cp += nwritten;
	}
	return (0);
}

static int
nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov,
    u_int iovcnt, size_t len)
{
	ssize_t nwritten;

	for (;;) {
		nwritten = writev(qp->s, iov, iovcnt);
		if (nwritten < 0)
			return (errno);

		len -= nwritten;
		if (len == 0)
			return (0);

		while (iov->iov_len <= (size_t)nwritten) {
			nwritten -= iov->iov_len;
			iovcnt--;
			iov++;
		}

		iov->iov_base = (char *)iov->iov_base + nwritten;
		iov->iov_len -= nwritten;
	}
}

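/*
 * Report a fatal protocol error to the remote end by sending a
 * termination request PDU that echoes up to the offending PDU header,
 * then drop the connection.
 */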
static void
nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen)
{
	struct nvme_tcp_term_req_hdr hdr;
	struct iovec iov[2];

	if (hlen != 0) {
		if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE)
			hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
		if (hlen > pdu_len)
			hlen = pdu_len;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.common.pdu_type = na->na_controller ?
	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
	hdr.common.hlen = sizeof(hdr);
	hdr.common.plen = htole32(sizeof(hdr) + hlen);
	hdr.fes = htole16(fes);
	le32enc(hdr.fei, fei);
	iov[0].iov_base = &hdr;
	iov[0].iov_len = sizeof(hdr);
	iov[1].iov_base = __DECONST(void *, rx_pdu);
	iov[1].iov_len = hlen;

	(void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen);
	close(qp->s);
	qp->s = -1;
}

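/*
 * Validate the common header of a received PDU and verify any header
 * and data digests, reporting a termination request on failure.
 */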
static int
nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu,
    size_t pdu_len)
{
	const struct nvme_tcp_common_pdu_hdr *ch;
	uint32_t data_len, fei, plen;
	uint32_t digest, rx_digest;
	u_int hlen;
	int error;
	uint16_t fes;

	/* Determine how large of a PDU header to return for errors. */
	ch = pdu->hdr;
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	error = nvmf_tcp_validate_pdu_header(ch,
	    qp->qp.nq_association->na_controller, qp->header_digests,
	    qp->data_digests, qp->rxpda, &data_len, &fes, &fei);
	if (error != 0) {
		if (error == ECONNRESET) {
			close(qp->s);
			qp->s = -1;
		} else {
			nvmf_tcp_report_error(qp->qp.nq_association, qp,
			    fes, fei, ch, pdu_len, hlen);
		}
		return (error);
	}

	/* Check header digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
		digest = compute_digest(ch, ch->hlen);
		memcpy(&rx_digest, (const char *)ch + ch->hlen,
		    sizeof(rx_digest));
		if (digest != rx_digest) {
			printf("NVMe/TCP: Header digest mismatch\n");
			nvmf_tcp_report_error(qp->qp.nq_association, qp,
			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch,
			    pdu_len, hlen);
			return (EBADMSG);
		}
	}

	/* Check data digest if present. */
	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
		digest = compute_digest((const char *)ch + ch->pdo, data_len);
		memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest),
		    sizeof(rx_digest));
		if (digest != rx_digest) {
			printf("NVMe/TCP: Data digest mismatch\n");
			return (EBADMSG);
		}
	}

	pdu->data_len = data_len;
	return (0);
}

/*
 * Read data from a socket, retrying until the data has been fully
 * read or an error occurs.
 */
static int
nvmf_tcp_read_buffer(int s, void *buf, size_t len)
{
	ssize_t nread;
	char *cp;

	cp = buf;
	while (len != 0) {
		nread = read(s, cp, len);
		if (nread < 0)
			return (errno);
		if (nread == 0)
			return (ECONNRESET);
		len -= nread;
		cp += nread;
	}
	return (0);
}

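/*
 * Read a single PDU from the socket: the common header first, then
 * the remainder of the PDU into a malloc'd buffer that is validated
 * before being returned to the caller.
 */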
static int
nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_common_pdu_hdr ch;
	uint32_t plen;
	int error;

	memset(pdu, 0, sizeof(*pdu));
	error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch));
	if (error != 0)
		return (error);

	plen = le32toh(ch.plen);

	/*
	 * Validate a header with garbage lengths to trigger
	 * an error message without reading more.
	 */
	if (plen < sizeof(ch) || ch.hlen > plen) {
		pdu->hdr = &ch;
		error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch));
		pdu->hdr = NULL;
		assert(error != 0);
		return (error);
	}

	/* Read the rest of the PDU. */
	pdu->hdr = malloc(plen);
	memcpy(pdu->hdr, &ch, sizeof(ch));
	error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch));
	if (error != 0) {
		free(pdu->hdr);
		pdu->hdr = NULL;
		return (error);
	}
	error = nvmf_tcp_validate_pdu(qp, pdu, plen);
	if (error != 0) {
		free(pdu->hdr);
		pdu->hdr = NULL;
	}
	return (error);
}

static void
nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
{
	free(pdu->hdr);
	pdu->hdr = NULL;
}

static int
nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_term_req_hdr *hdr;

	hdr = (void *)pdu->hdr;

	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
	    le16toh(hdr->fes), le32dec(hdr->fei));
	nvmf_tcp_free_pdu(pdu);
	return (ECONNRESET);
}

static int
nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_cmd *cmd;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	cmd = (void *)pdu->hdr;

	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe);
	if (nc == NULL)
		return (ENOMEM);

	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
	return (0);
}

static int
nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
    struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_rsp *rsp;
	struct nvmf_capsule *nc;
	struct nvmf_tcp_capsule *tc;

	rsp = (void *)pdu->hdr;

	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe);
	if (nc == NULL)
		return (ENOMEM);

	nc->nc_sqhd_valid = true;
	tc = TCAP(nc);
	tc->rx_pdu = *pdu;

	TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);

	/*
	 * Once the CQE has been received, no further transfers to the
	 * command buffer for the associated CID can occur.
	 */
	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true);
	tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false);

	return (0);
}

/*
 * Construct and send a PDU that contains an optional data payload.
 * This includes dealing with digests and the length fields in the
 * common header.
 */
static int
nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
    void *data, uint32_t data_len)
{
	struct nvme_tcp_common_pdu_hdr *ch;
	struct iovec iov[5];
	u_int iovcnt;
	uint32_t header_digest, data_digest, pad, pdo, plen;

	plen = hlen;
	if (qp->header_digests)
		plen += sizeof(header_digest);
	if (data_len != 0) {
		pdo = roundup2(plen, qp->txpda);
		pad = pdo - plen;
		plen = pdo + data_len;
		if (qp->data_digests)
			plen += sizeof(data_digest);
	} else {
		assert(data == NULL);
		pdo = 0;
		pad = 0;
	}

	ch = hdr;
	ch->hlen = hlen;
	if (qp->header_digests)
		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
	if (qp->data_digests && data_len != 0)
		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
	ch->pdo = pdo;
	ch->plen = htole32(plen);

	/* CH + PSH */
	iov[0].iov_base = hdr;
	iov[0].iov_len = hlen;
	iovcnt = 1;

	/* HDGST */
	if (qp->header_digests) {
		header_digest = compute_digest(hdr, hlen);
		iov[iovcnt].iov_base = &header_digest;
		iov[iovcnt].iov_len = sizeof(header_digest);
		iovcnt++;
	}

	if (pad != 0) {
		/* PAD */
		iov[iovcnt].iov_base = __DECONST(char *, zero_padding);
		iov[iovcnt].iov_len = pad;
		iovcnt++;
	}

	if (data_len != 0) {
		/* DATA */
		iov[iovcnt].iov_base = data;
		iov[iovcnt].iov_len = data_len;
		iovcnt++;

		/* DDGST */
		if (qp->data_digests) {
			data_digest = compute_digest(data, data_len);
			iov[iovcnt].iov_base = &data_digest;
			iov[iovcnt].iov_len = sizeof(data_digest);
			iovcnt++;
		}
	}

	return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen));
}

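/*
 * Controller side: copy the payload of an H2C_DATA PDU into the
 * command buffer previously associated with its transfer tag,
 * validating the offset, length, and PDU ordering.
 */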
static int
nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_h2c_data_hdr *h2c;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	const char *icd;

	h2c = (void *)pdu->hdr;
	if (le32toh(h2c->datal) > qp->maxh2cdata) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true);
	if (cb == NULL) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_len = le32toh(h2c->datal);
	if (data_len != pdu->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(h2c->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
	memcpy((char *)cb->data + data_offset, icd, data_len);

	nvmf_tcp_free_pdu(pdu);
	return (0);
}

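/*
 * Host side: copy the payload of a C2H_DATA PDU into the receive
 * command buffer for the matching CID.  If the SUCCESS flag is set,
 * synthesize a completion rather than waiting for a response capsule.
 */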
static int
nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvme_tcp_c2h_data_hdr *c2h;
	struct nvmf_tcp_command_buffer *cb;
	uint32_t data_len, data_offset;
	const char *icd;

	c2h = (void *)pdu->hdr;

	cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true);
	if (cb == NULL) {
		/*
		 * XXX: Could be PDU sequence error if cccid is for a
		 * command that doesn't use a command buffer.
		 */
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_len = le32toh(c2h->datal);
	if (data_len != pdu->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(c2h->datao);
	if (data_offset < cb->data_offset ||
	    data_offset + data_len > cb->data_offset + cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if (data_offset != cb->data_offset + cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	if ((cb->data_xfered + data_len == cb->data_len) !=
	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;
	data_offset -= cb->data_offset;
	icd = (const char *)pdu->hdr + pdu->hdr->pdo;
	memcpy((char *)cb->data + data_offset, icd, data_len);

	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
		struct nvme_completion cqe;
		struct nvmf_tcp_capsule *tc;
		struct nvmf_capsule *nc;

		memset(&cqe, 0, sizeof(cqe));
		cqe.cid = cb->cid;

		nc = nvmf_allocate_response(&qp->qp, &cqe);
		if (nc == NULL) {
			nvmf_tcp_free_pdu(pdu);
			return (ENOMEM);
		}
		nc->nc_sqhd_valid = false;

		tc = TCAP(nc);
		TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link);
	}

	nvmf_tcp_free_pdu(pdu);
	return (0);
}

/* NB: cid and ttag are little-endian already. */
static int
tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
{
	struct nvme_tcp_h2c_data_hdr h2c;

	memset(&h2c, 0, sizeof(h2c));
	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
	if (last_pdu)
		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
	h2c.cccid = cid;
	h2c.ttag = ttag;
	h2c.datao = htole32(data_offset);
	h2c.datal = htole32(len);

	return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len));
}

/* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */
static int
tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, void *buf, size_t len, bool last_pdu)
{
	char *p;

	p = buf;
	while (len != 0) {
		size_t todo;
		int error;

		todo = len;
		if (todo > qp->maxh2cdata)
			todo = qp->maxh2cdata;
		error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo,
		    last_pdu && todo == len);
		if (error != 0)
			return (error);
		p += todo;
		len -= todo;
		data_offset += todo;
	}
	return (0);
}

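/*
 * Host side: satisfy an R2T from the controller by sending the
 * requested range of the transmit command buffer as H2C_DATA PDUs.
 */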
static int
nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
{
	struct nvmf_tcp_command_buffer *cb;
	struct nvme_tcp_r2t_hdr *r2t;
	uint32_t data_len, data_offset;
	int error;

	r2t = (void *)pdu->hdr;

	cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false);
	if (cb == NULL) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	data_offset = le32toh(r2t->r2to);
	if (data_offset != cb->data_xfered) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr,
		    le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	/*
	 * XXX: The spec does not specify how to handle R2T transfers
	 * out of range of the original command.
	 */
	data_len = le32toh(r2t->r2tl);
	if (data_offset + data_len > cb->data_len) {
		nvmf_tcp_report_error(qp->qp.nq_association, qp,
		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
		    pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen);
		nvmf_tcp_free_pdu(pdu);
		return (EBADMSG);
	}

	cb->data_xfered += data_len;

	/*
	 * Write out one or more H2C_DATA PDUs containing the
	 * requested data.
	 */
	error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag,
	    data_offset, (char *)cb->data + data_offset, data_len, true);

	nvmf_tcp_free_pdu(pdu);
	return (error);
}

static int
nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp)
{
	struct nvmf_tcp_rxpdu pdu;
	int error;

	error = nvmf_tcp_read_pdu(qp, &pdu);
	if (error != 0)
		return (error);

	switch (pdu.hdr->pdu_type) {
	default:
		__unreachable();
		break;
	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
		return (nvmf_tcp_handle_term_req(&pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
		return (nvmf_tcp_save_command_capsule(qp, &pdu));
	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
		return (nvmf_tcp_save_response_capsule(qp, &pdu));
	case NVME_TCP_PDU_TYPE_H2C_DATA:
		return (nvmf_tcp_handle_h2c_data(qp, &pdu));
	case NVME_TCP_PDU_TYPE_C2H_DATA:
		return (nvmf_tcp_handle_c2h_data(qp, &pdu));
	case NVME_TCP_PDU_TYPE_R2T:
		return (nvmf_tcp_handle_r2t(qp, &pdu));
	}
}

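/*
 * Validate an ICReq or ICResp PDU.  The FEI reported for an invalid
 * field is the byte offset of that field within the PDU.
 */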
static bool
nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len)
{
	const struct nvme_tcp_ic_req *pdu;
	uint32_t plen;
	u_int hlen;

	/* Determine how large of a PDU header to return for errors. */
	hlen = ch->hlen;
	plen = le32toh(ch->plen);
	if (hlen < sizeof(*ch) || hlen > plen)
		hlen = sizeof(*ch);

	/*
	 * Errors must be reported for the lowest incorrect field
	 * first, so validate fields in order.
	 */

	/* Validate pdu_type. */

	/* Controllers only receive PDUs with a PDU direction of 0. */
	if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) {
		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
		    hlen);
		return (false);
	}

	switch (ch->pdu_type) {
	case NVME_TCP_PDU_TYPE_IC_REQ:
	case NVME_TCP_PDU_TYPE_IC_RESP:
		break;
	default:
		na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate flags. */
	if (ch->flags != 0) {
		na_error(na, "NVMe/TCP: Invalid PDU header flags %#x",
		    ch->flags);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate hlen. */
	if (ch->hlen != 128) {
		na_error(na, "NVMe/TCP: Invalid PDU header length %u",
		    ch->hlen);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate pdo. */
	if (ch->pdo != 0) {
		na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate plen. */
	if (plen != 128) {
		na_error(na, "NVMe/TCP: Invalid PDU length %u", plen);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len,
		    hlen);
		return (false);
	}

	/* Validate fields common to both ICReq and ICResp. */
	pdu = (const struct nvme_tcp_ic_req *)ch;
	if (le16toh(pdu->pfv) != 0) {
		na_error(na, "NVMe/TCP: Unsupported PDU version %u",
		    le16toh(pdu->pfv));
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
		    8, ch, pdu_len, hlen);
		return (false);
	}

	if (pdu->hpda > NVME_TCP_HPDA_MAX) {
		na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda);
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len,
		    hlen);
		return (false);
	}

	if (pdu->dgst.bits.reserved != 0) {
		na_error(na, "NVMe/TCP: Invalid digest settings");
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len,
		    hlen);
		return (false);
	}

	return (true);
}

static bool
nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_req *pdu)
{
	int error;

	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
	if (error != 0) {
		na_error(na, "NVMe/TCP: Failed to read IC request: %s",
		    strerror(error));
		return (false);
	}

	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
}

static bool
nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp,
    struct nvme_tcp_ic_resp *pdu)
{
	int error;

	error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu));
	if (error != 0) {
		na_error(na, "NVMe/TCP: Failed to read IC response: %s",
		    strerror(error));
		return (false);
	}

	return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu)));
}

static struct nvmf_association *
tcp_allocate_association(bool controller __unused,
    const struct nvmf_association_params *params __unused)
{
	struct nvmf_tcp_association *ta;

	ta = calloc(1, sizeof(*ta));

	return (&ta->na);
}

static void
tcp_update_association(struct nvmf_association *na,
    const struct nvme_controller_data *cdata)
{
	struct nvmf_tcp_association *ta = TASSOC(na);

	ta->ioccsz = le32toh(cdata->ioccsz);
}

static void
tcp_free_association(struct nvmf_association *na)
{
	free(na);
}

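/*
 * Host side of connection establishment: send an ICReq, validate the
 * ICResp, and record the negotiated PDU alignment, digest, and data
 * transfer limits on the queue pair.
 */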
static bool
tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin)
{
	const struct nvmf_association_params *params = &na->na_params;
	struct nvmf_tcp_association *ta = TASSOC(na);
	struct nvme_tcp_ic_req ic_req;
	struct nvme_tcp_ic_resp ic_resp;
	int error;

	if (!admin) {
		if (ta->ioccsz == 0) {
			na_error(na, "TCP I/O queues require cdata");
			return (false);
		}
		if (ta->ioccsz < 4) {
			na_error(na, "Invalid IOCCSZ %u", ta->ioccsz);
			return (false);
		}
	}

	memset(&ic_req, 0, sizeof(ic_req));
	ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ;
	ic_req.common.hlen = sizeof(ic_req);
	ic_req.common.plen = htole32(sizeof(ic_req));
	ic_req.pfv = htole16(0);
	ic_req.hpda = params->tcp.pda;
	if (params->tcp.header_digests)
		ic_req.dgst.bits.hdgst_enable = 1;
	if (params->tcp.data_digests)
		ic_req.dgst.bits.ddgst_enable = 1;
	ic_req.maxr2t = htole32(params->tcp.maxr2t);

	error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req));
	if (error != 0) {
		na_error(na, "Failed to write IC request: %s", strerror(error));
		return (false);
	}

	if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp))
		return (false);

	/* Ensure the controller didn't enable digests we didn't request. */
	if ((!params->tcp.header_digests &&
	    ic_resp.dgst.bits.hdgst_enable != 0) ||
	    (!params->tcp.data_digests &&
	    ic_resp.dgst.bits.ddgst_enable != 0)) {
		na_error(na, "Controller enabled unrequested digests");
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER,
		    11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp));
		return (false);
	}

	/*
	 * XXX: Is there an upper-bound to enforce here?  Perhaps pick
	 * some large value and report larger values as an unsupported
	 * parameter?
	 */
	if (le32toh(ic_resp.maxh2cdata) < 4096) {
		na_error(na, "Invalid MAXH2CDATA %u",
		    le32toh(ic_resp.maxh2cdata));
		nvmf_tcp_report_error(na, qp,
		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp,
		    sizeof(ic_resp), sizeof(ic_resp));
		return (false);
	}

	qp->txpda = (params->tcp.pda + 1) * 4;
	qp->rxpda = (ic_resp.cpda + 1) * 4;
	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
	qp->maxr2t = params->tcp.maxr2t;
	qp->maxh2cdata = le32toh(ic_resp.maxh2cdata);
	if (admin)
		/* 7.4.3 */
		qp->max_icd = 8192;
	else
		qp->max_icd = (ta->ioccsz - 4) * 16;

	return (true);
}

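/*
 * Controller side of connection establishment: validate the host's
 * ICReq and reply with an ICResp advertising this association's
 * parameters.
 */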
static bool
tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na)
{
	const struct nvmf_association_params *params = &na->na_params;
	struct nvme_tcp_ic_req ic_req;
	struct nvme_tcp_ic_resp ic_resp;
	int error;

	if (!nvmf_tcp_read_ic_req(na, qp, &ic_req))
		return (false);

	memset(&ic_resp, 0, sizeof(ic_resp));
	ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP;
	ic_resp.common.hlen = sizeof(ic_resp);
	ic_resp.common.plen = htole32(sizeof(ic_resp));
	ic_resp.pfv = htole16(0);
	ic_resp.cpda = params->tcp.pda;
	if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0)
		ic_resp.dgst.bits.hdgst_enable = 1;
	if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0)
		ic_resp.dgst.bits.ddgst_enable = 1;
	ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata);

	error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp));
	if (error != 0) {
		na_error(na, "Failed to write IC response: %s",
		    strerror(error));
		return (false);
	}

	qp->txpda = (params->tcp.pda + 1) * 4;
	qp->rxpda = (ic_req.hpda + 1) * 4;
	qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0;
	qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0;
	qp->maxr2t = le32toh(ic_req.maxr2t);
	qp->maxh2cdata = params->tcp.maxh2cdata;
	qp->max_icd = 0;	/* XXX */
	return (true);
}

static struct nvmf_qpair *
tcp_allocate_qpair(struct nvmf_association *na,
    const struct nvmf_qpair_params *qparams)
{
	const struct nvmf_association_params *aparams = &na->na_params;
	struct nvmf_tcp_qpair *qp;
	bool ok;

	if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) {
		na_error(na, "Invalid PDA");
		return (NULL);
	}

	qp = calloc(1, sizeof(*qp));
	qp->s = qparams->tcp.fd;
	LIST_INIT(&qp->rx_buffers);
	LIST_INIT(&qp->tx_buffers);
	TAILQ_INIT(&qp->rx_capsules);
	if (na->na_controller)
		ok = tcp_accept(qp, na);
	else
		ok = tcp_connect(qp, na, qparams->admin);
	if (!ok) {
		free(qp);
		return (NULL);
	}

	return (&qp->qp);
}

static void
tcp_free_qpair(struct nvmf_qpair *nq)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_capsule *ntc, *tc;
	struct nvmf_tcp_command_buffer *ncb, *cb;

	TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) {
		TAILQ_REMOVE(&qp->rx_capsules, tc, link);
		nvmf_free_capsule(&tc->nc);
	}
	LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) {
		tcp_free_command_buffer(cb);
	}
	LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) {
		tcp_free_command_buffer(cb);
	}
	free(qp);
}

static int
tcp_kernel_handoff_params(struct nvmf_qpair *nq,
    struct nvmf_handoff_qpair_params *qparams)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);

	qparams->tcp.fd = qp->s;
	qparams->tcp.rxpda = qp->rxpda;
	qparams->tcp.txpda = qp->txpda;
	qparams->tcp.header_digests = qp->header_digests;
	qparams->tcp.data_digests = qp->data_digests;
	qparams->tcp.maxr2t = qp->maxr2t;
	qparams->tcp.maxh2cdata = qp->maxh2cdata;
	qparams->tcp.max_icd = qp->max_icd;

	return (0);
}

static struct nvmf_capsule *
tcp_allocate_capsule(struct nvmf_qpair *qp __unused)
{
	struct nvmf_tcp_capsule *nc;

	nc = calloc(1, sizeof(*nc));
	return (&nc->nc);
}

static void
tcp_free_capsule(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_capsule *tc = TCAP(nc);

	nvmf_tcp_free_pdu(&tc->rx_pdu);
	if (tc->cb != NULL)
		tcp_free_command_buffer(tc->cb);
	free(tc);
}

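/*
 * Transmit a command capsule.  Small host-to-controller payloads are
 * sent as in-capsule data; larger transfers use a command buffer and
 * are driven by later R2T (host writes) or C2H_DATA (host reads) PDUs.
 */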
static int
tcp_transmit_command(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_capsule *tc = TCAP(nc);
	struct nvme_tcp_cmd cmd;
	struct nvme_sgl_descriptor *sgl;
	int error;
	bool use_icd;

	use_icd = false;
	if (nc->nc_data_len != 0 && nc->nc_send_data &&
	    nc->nc_data_len <= qp->max_icd)
		use_icd = true;

	memset(&cmd, 0, sizeof(cmd));
	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
	cmd.ccsqe = nc->nc_sqe;

	/* Populate SGL in SQE. */
	sgl = &cmd.ccsqe.sgl;
	memset(sgl, 0, sizeof(*sgl));
	sgl->address = 0;
	sgl->length = htole32(nc->nc_data_len);
	if (use_icd) {
		/* Use in-capsule data. */
		sgl->type = NVME_SGL_TYPE_ICD;
	} else {
		/* Use a command buffer. */
		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
	}

	/* Send command capsule. */
	error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ?
	    nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0);
	if (error != 0)
		return (error);

	/*
	 * If data will be transferred using a command buffer, allocate a
	 * buffer structure and queue it.
	 */
	if (nc->nc_data_len != 0 && !use_icd)
		tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0,
		    nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data);

	return (0);
}

static int
tcp_transmit_response(struct nvmf_capsule *nc)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvme_tcp_rsp rsp;

	memset(&rsp, 0, sizeof(rsp));
	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
	rsp.rccqe = nc->nc_cqe;

	return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
}

static int
tcp_transmit_capsule(struct nvmf_capsule *nc)
{
	if (nc->nc_qe_len == sizeof(struct nvme_command))
		return (tcp_transmit_command(nc));
	else
		return (tcp_transmit_response(nc));
}

static int
tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp)
{
	struct nvmf_tcp_qpair *qp = TQP(nq);
	struct nvmf_tcp_capsule *tc;
	int error;

	while (TAILQ_EMPTY(&qp->rx_capsules)) {
		error = nvmf_tcp_receive_pdu(qp);
		if (error != 0)
			return (error);
	}
	tc = TAILQ_FIRST(&qp->rx_capsules);
	TAILQ_REMOVE(&qp->rx_capsules, tc, link);
	*ncp = &tc->nc;
	return (0);
}

static uint8_t
tcp_validate_command_capsule(const struct nvmf_capsule *nc)
{
	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
	const struct nvme_sgl_descriptor *sgl;

	assert(tc->rx_pdu.hdr != NULL);

	sgl = &nc->nc_sqe.sgl;
	switch (sgl->type) {
	case NVME_SGL_TYPE_ICD:
		if (tc->rx_pdu.data_len != le32toh(sgl->length)) {
			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
		}
		break;
	case NVME_SGL_TYPE_COMMAND_BUFFER:
		if (tc->rx_pdu.data_len != 0) {
			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
			return (NVME_SC_INVALID_FIELD);
		}
		break;
	default:
		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
	}

	if (sgl->address != 0) {
		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
		return (NVME_SC_SGL_OFFSET_INVALID);
	}

	return (NVME_SC_SUCCESS);
}

static size_t
tcp_capsule_data_len(const struct nvmf_capsule *nc)
{
	assert(nc->nc_qe_len == sizeof(struct nvme_command));
	return (le32toh(nc->nc_sqe.sgl.length));
}

/* NB: cid and ttag are both little-endian already. */
static int
tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
    uint32_t data_offset, uint32_t data_len)
{
	struct nvme_tcp_r2t_hdr r2t;

	memset(&r2t, 0, sizeof(r2t));
	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
	r2t.cccid = cid;
	r2t.ttag = ttag;
	r2t.r2to = htole32(data_offset);
	r2t.r2tl = htole32(data_len);

	return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0));
}

static int
tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	struct nvmf_tcp_command_buffer *cb;
	int error;
	uint16_t ttag;

	/*
	 * Don't bother byte-swapping ttag as it is just a cookie
	 * value returned by the other end as-is.
	 */
	ttag = qp->next_ttag++;

	error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len);
	if (error != 0)
		return (error);

	cb = tcp_alloc_command_buffer(qp, buf, data_offset, len,
	    nc->nc_sqe.cid, ttag, true);

	/* Parse received PDUs until the data transfer is complete. */
	while (cb->data_xfered < cb->data_len) {
		error = nvmf_tcp_receive_pdu(qp);
		if (error != 0)
			break;
	}
	tcp_free_command_buffer(cb);
	return (error);
}

static int
tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	const struct nvmf_tcp_capsule *tc = CTCAP(nc);
	const char *icd;

	icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset;
	memcpy(buf, icd, len);
	return (0);
}

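/*
 * Controller side: fetch command data from the host, either by
 * copying in-capsule data or by issuing an R2T and collecting the
 * resulting H2C_DATA PDUs.
 */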
static int
tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset,
    void *buf, size_t len)
{
	struct nvmf_association *na = nc->nc_qpair->nq_association;
	const struct nvme_sgl_descriptor *sgl;
	size_t data_len;

	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
		return (EINVAL);

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (data_offset + len > data_len)
		return (EFBIG);

	if (sgl->type == NVME_SGL_TYPE_ICD)
		return (tcp_receive_icd_data(nc, data_offset, buf, len));
	else
		return (tcp_receive_r2t_data(nc, data_offset, buf, len));
}

/* NB: cid is little-endian already. */
static int
tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid,
    uint32_t data_offset, const void *buf, size_t len, bool last_pdu,
    bool success)
{
	struct nvme_tcp_c2h_data_hdr c2h;

	memset(&c2h, 0, sizeof(c2h));
	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
	if (last_pdu)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
	if (success)
		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
	c2h.cccid = cid;
	c2h.datao = htole32(data_offset);
	c2h.datal = htole32(len);

	return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h),
	    __DECONST(void *, buf), len));
}

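/*
 * Controller side: return data to the host as one or more C2H_DATA
 * PDUs, completing the command via the SUCCESS flag when SQ flow
 * control is disabled.
 */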
static int
tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf,
    size_t len)
{
	struct nvmf_association *na = nc->nc_qpair->nq_association;
	struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair);
	const struct nvme_sgl_descriptor *sgl;
	const char *src;
	size_t todo;
	uint32_t data_len, data_offset;
	int error;
	bool last_pdu, send_success_flag;

	if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller)
		return (EINVAL);

	sgl = &nc->nc_sqe.sgl;
	data_len = le32toh(sgl->length);
	if (len != data_len) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return (EFBIG);
	}

	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
		return (EINVAL);
	}

	/* Use the SUCCESS flag if SQ flow control is disabled. */
	send_success_flag = !qp->qp.nq_flow_control;

	/*
	 * Write out one or more C2H_DATA PDUs containing the data.
	 * Each PDU is arbitrarily capped at 256k.
	 */
	data_offset = 0;
	src = buf;
	while (len > 0) {
		if (len > 256 * 1024) {
			todo = 256 * 1024;
			last_pdu = false;
		} else {
			todo = len;
			last_pdu = true;
		}
		error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset,
		    src, todo, last_pdu, last_pdu && send_success_flag);
		if (error != 0) {
			nvmf_send_generic_error(nc,
			    NVME_SC_TRANSIENT_TRANSPORT_ERROR);
			return (error);
		}
		data_offset += todo;
		src += todo;
		len -= todo;
	}
	if (!send_success_flag)
		nvmf_send_success(nc);
	return (0);
}

struct nvmf_transport_ops tcp_ops = {
	.allocate_association = tcp_allocate_association,
	.update_association = tcp_update_association,
	.free_association = tcp_free_association,
	.allocate_qpair = tcp_allocate_qpair,
	.free_qpair = tcp_free_qpair,
	.kernel_handoff_params = tcp_kernel_handoff_params,
	.allocate_capsule = tcp_allocate_capsule,
	.free_capsule = tcp_free_capsule,
	.transmit_capsule = tcp_transmit_capsule,
	.receive_capsule = tcp_receive_capsule,
	.validate_command_capsule = tcp_validate_command_capsule,
	.capsule_data_len = tcp_capsule_data_len,
	.receive_controller_data = tcp_receive_controller_data,
	.send_controller_data = tcp_send_controller_data,
};