icl_soft.c revision 264022
1/*-
2 * Copyright (c) 2012 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: head/sys/dev/iscsi/icl.c 264022 2014-04-01 21:40:46Z trasz $
30 */
31
32/*
33 * iSCSI Common Layer.  It's used by both the initiator and target to send
34 * and receive iSCSI PDUs.
35 */
36
37#include <sys/param.h>
38#include <sys/capsicum.h>
39#include <sys/condvar.h>
40#include <sys/conf.h>
41#include <sys/file.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/mbuf.h>
46#include <sys/mutex.h>
47#include <sys/module.h>
48#include <sys/socket.h>
49#include <sys/socketvar.h>
50#include <sys/sysctl.h>
51#include <sys/systm.h>
52#include <sys/sx.h>
53#include <sys/uio.h>
54#include <vm/uma.h>
55#include <netinet/in.h>
56#include <netinet/tcp.h>
57
58#include "icl.h"
59#include "iscsi_proto.h"
60
/*
 * kern.icl sysctl node, plus the tunables/sysctls that live under it.
 */
SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
/* Verbosity knob: ICL_WARN() prints when > 0, ICL_DEBUG() when > 1. */
static int debug = 1;
TUNABLE_INT("kern.icl.debug", &debug);
SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
    &debug, 1, "Enable debug messages");
/*
 * Cap applied by icl_pdu_data_segment_receive_len() to the number of bytes
 * of a data segment that must be queued in the socket before the receive
 * path reads a partial segment.
 */
static int partial_receive_len = 1 * 1024; /* XXX: More? */
TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
    &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
    "data segment");

/* UMA zones backing struct icl_conn and struct icl_pdu; set up in icl_load(). */
static uma_zone_t icl_conn_zone;
static uma_zone_t icl_pdu_zone;

/* Count of live connections; icl_unload() refuses to unload while nonzero. */
static volatile u_int	icl_ncons;
76
/*
 * Print a printf-style debug message prefixed with the name of the calling
 * function and terminated with a newline.  Emitted only when the "debug"
 * knob is greater than 1.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1)						\
			printf("%s: " X "\n", __func__, ## __VA_ARGS__);\
	} while (0)

/*
 * Print a printf-style warning prefixed with "WARNING: " and the name of
 * the calling function.  Emitted only when the "debug" knob is greater
 * than 0.
 */
#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)
90
/*
 * Helpers for the connection mutex (ic_lock, supplied by the owner of the
 * connection in icl_conn_new()).  X is a pointer to struct icl_conn; it is
 * parenthesised so that any pointer expression, not just a plain
 * identifier, expands correctly.
 */
#define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
95
96static void
97icl_conn_fail(struct icl_conn *ic)
98{
99	if (ic->ic_socket == NULL)
100		return;
101
102	/*
103	 * XXX
104	 */
105	ic->ic_socket->so_error = EDOOFUS;
106	(ic->ic_error)(ic);
107}
108
109static struct mbuf *
110icl_conn_receive(struct icl_conn *ic, size_t len)
111{
112	struct uio uio;
113	struct socket *so;
114	struct mbuf *m;
115	int error, flags;
116
117	so = ic->ic_socket;
118
119	memset(&uio, 0, sizeof(uio));
120	uio.uio_resid = len;
121
122	flags = MSG_DONTWAIT;
123	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
124	if (error != 0) {
125		ICL_DEBUG("soreceive error %d", error);
126		return (NULL);
127	}
128	if (uio.uio_resid != 0) {
129		m_freem(m);
130		ICL_DEBUG("short read");
131		return (NULL);
132	}
133
134	return (m);
135}
136
137static struct icl_pdu *
138icl_pdu_new(struct icl_conn *ic, int flags)
139{
140	struct icl_pdu *ip;
141
142#ifdef DIAGNOSTIC
143	refcount_acquire(&ic->ic_outstanding_pdus);
144#endif
145	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
146	if (ip == NULL) {
147		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
148#ifdef DIAGNOSTIC
149		refcount_release(&ic->ic_outstanding_pdus);
150#endif
151		return (NULL);
152	}
153
154	ip->ip_conn = ic;
155
156	return (ip);
157}
158
159void
160icl_pdu_free(struct icl_pdu *ip)
161{
162	struct icl_conn *ic;
163
164	ic = ip->ip_conn;
165
166	m_freem(ip->ip_bhs_mbuf);
167	m_freem(ip->ip_ahs_mbuf);
168	m_freem(ip->ip_data_mbuf);
169	uma_zfree(icl_pdu_zone, ip);
170#ifdef DIAGNOSTIC
171	refcount_release(&ic->ic_outstanding_pdus);
172#endif
173}
174
175/*
176 * Allocate icl_pdu with empty BHS to fill up by the caller.
177 */
178struct icl_pdu *
179icl_pdu_new_bhs(struct icl_conn *ic, int flags)
180{
181	struct icl_pdu *ip;
182
183	ip = icl_pdu_new(ic, flags);
184	if (ip == NULL)
185		return (NULL);
186
187	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
188	    flags, MT_DATA, M_PKTHDR);
189	if (ip->ip_bhs_mbuf == NULL) {
190		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
191		icl_pdu_free(ip);
192		return (NULL);
193	}
194	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
195	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
196	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
197
198	return (ip);
199}
200
201static int
202icl_pdu_ahs_length(const struct icl_pdu *request)
203{
204
205	return (request->ip_bhs->bhs_total_ahs_len * 4);
206}
207
208size_t
209icl_pdu_data_segment_length(const struct icl_pdu *request)
210{
211	uint32_t len = 0;
212
213	len += request->ip_bhs->bhs_data_segment_len[0];
214	len <<= 8;
215	len += request->ip_bhs->bhs_data_segment_len[1];
216	len <<= 8;
217	len += request->ip_bhs->bhs_data_segment_len[2];
218
219	return (len);
220}
221
222static void
223icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
224{
225
226	response->ip_bhs->bhs_data_segment_len[2] = len;
227	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
228	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
229}
230
231static size_t
232icl_pdu_padding(const struct icl_pdu *ip)
233{
234
235	if ((ip->ip_data_len % 4) != 0)
236		return (4 - (ip->ip_data_len % 4));
237
238	return (0);
239}
240
241static size_t
242icl_pdu_size(const struct icl_pdu *response)
243{
244	size_t len;
245
246	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
247
248	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
249	    icl_pdu_padding(response);
250	if (response->ip_conn->ic_header_crc32c)
251		len += ISCSI_HEADER_DIGEST_SIZE;
252	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
253		len += ISCSI_DATA_DIGEST_SIZE;
254
255	return (len);
256}
257
258static int
259icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
260{
261	struct mbuf *m;
262
263	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
264	if (m == NULL) {
265		ICL_DEBUG("failed to receive BHS");
266		return (-1);
267	}
268
269	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
270	if (request->ip_bhs_mbuf == NULL) {
271		ICL_WARN("m_pullup failed");
272		return (-1);
273	}
274	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
275
276	/*
277	 * XXX: For architectures with strict alignment requirements
278	 * 	we may need to allocate ip_bhs and copy the data into it.
279	 * 	For some reason, though, not doing this doesn't seem
280	 * 	to cause problems; tested on sparc64.
281	 */
282
283	*availablep -= sizeof(struct iscsi_bhs);
284	return (0);
285}
286
287static int
288icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
289{
290
291	request->ip_ahs_len = icl_pdu_ahs_length(request);
292	if (request->ip_ahs_len == 0)
293		return (0);
294
295	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
296	    request->ip_ahs_len);
297	if (request->ip_ahs_mbuf == NULL) {
298		ICL_DEBUG("failed to receive AHS");
299		return (-1);
300	}
301
302	*availablep -= request->ip_ahs_len;
303	return (0);
304}
305
306static uint32_t
307icl_mbuf_to_crc32c(const struct mbuf *m0)
308{
309	uint32_t digest = 0xffffffff;
310	const struct mbuf *m;
311
312	for (m = m0; m != NULL; m = m->m_next)
313		digest = calculate_crc32c(digest,
314		    mtod(m, const void *), m->m_len);
315
316	digest = digest ^ 0xffffffff;
317
318	return (digest);
319}
320
321static int
322icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
323{
324	struct mbuf *m;
325	uint32_t received_digest, valid_digest;
326
327	if (request->ip_conn->ic_header_crc32c == false)
328		return (0);
329
330	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
331	if (m == NULL) {
332		ICL_DEBUG("failed to receive header digest");
333		return (-1);
334	}
335
336	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
337	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
338	m_freem(m);
339
340	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
341
342	/*
343	 * XXX: Handle AHS.
344	 */
345	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
346	if (received_digest != valid_digest) {
347		ICL_WARN("header digest check failed; got 0x%x, "
348		    "should be 0x%x", received_digest, valid_digest);
349		return (-1);
350	}
351
352	return (0);
353}
354
355/*
356 * Return the number of bytes that should be waiting in the receive socket
357 * before icl_pdu_receive_data_segment() gets called.
358 */
359static size_t
360icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
361{
362	size_t len;
363
364	len = icl_pdu_data_segment_length(request);
365	if (len == 0)
366		return (0);
367
368	/*
369	 * Account for the parts of data segment already read from
370	 * the socket buffer.
371	 */
372	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
373	len -= request->ip_data_len;
374
375	/*
376	 * Don't always wait for the full data segment to be delivered
377	 * to the socket; this might badly affect performance due to
378	 * TCP window scaling.
379	 */
380	if (len > partial_receive_len) {
381#if 0
382		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
383		    len, partial_receive_len));
384#endif
385		len = partial_receive_len;
386
387		return (len);
388	}
389
390	/*
391	 * Account for padding.  Note that due to the way code is written,
392	 * the icl_pdu_receive_data_segment() must always receive padding
393	 * along with the last part of data segment, because it would be
394	 * impossible to tell whether we've already received the full data
395	 * segment including padding, or without it.
396	 */
397	if ((len % 4) != 0)
398		len += 4 - (len % 4);
399
400#if 0
401	ICL_DEBUG("need %zd bytes of data", len));
402#endif
403
404	return (len);
405}
406
407static int
408icl_pdu_receive_data_segment(struct icl_pdu *request,
409    size_t *availablep, bool *more_neededp)
410{
411	struct icl_conn *ic;
412	size_t len, padding = 0;
413	struct mbuf *m;
414
415	ic = request->ip_conn;
416
417	*more_neededp = false;
418	ic->ic_receive_len = 0;
419
420	len = icl_pdu_data_segment_length(request);
421	if (len == 0)
422		return (0);
423
424	if ((len % 4) != 0)
425		padding = 4 - (len % 4);
426
427	/*
428	 * Account for already received parts of data segment.
429	 */
430	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
431	len -= request->ip_data_len;
432
433	if (len + padding > *availablep) {
434		/*
435		 * Not enough data in the socket buffer.  Receive as much
436		 * as we can.  Don't receive padding, since, obviously, it's
437		 * not the end of data segment yet.
438		 */
439#if 0
440		ICL_DEBUG("limited from %zd to %zd",
441		    len + padding, *availablep - padding));
442#endif
443		len = *availablep - padding;
444		*more_neededp = true;
445		padding = 0;
446	}
447
448	/*
449	 * Must not try to receive padding without at least one byte
450	 * of actual data segment.
451	 */
452	if (len > 0) {
453		m = icl_conn_receive(request->ip_conn, len + padding);
454		if (m == NULL) {
455			ICL_DEBUG("failed to receive data segment");
456			return (-1);
457		}
458
459		if (request->ip_data_mbuf == NULL)
460			request->ip_data_mbuf = m;
461		else
462			m_cat(request->ip_data_mbuf, m);
463
464		request->ip_data_len += len;
465		*availablep -= len + padding;
466	} else
467		ICL_DEBUG("len 0");
468
469	if (*more_neededp)
470		ic->ic_receive_len =
471		    icl_pdu_data_segment_receive_len(request);
472
473	return (0);
474}
475
476static int
477icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
478{
479	struct mbuf *m;
480	uint32_t received_digest, valid_digest;
481
482	if (request->ip_conn->ic_data_crc32c == false)
483		return (0);
484
485	if (request->ip_data_len == 0)
486		return (0);
487
488	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
489	if (m == NULL) {
490		ICL_DEBUG("failed to receive data digest");
491		return (-1);
492	}
493
494	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
495	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
496	m_freem(m);
497
498	*availablep -= ISCSI_DATA_DIGEST_SIZE;
499
500	/*
501	 * Note that ip_data_mbuf also contains padding; since digest
502	 * calculation is supposed to include that, we iterate over
503	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
504	 */
505	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
506	if (received_digest != valid_digest) {
507		ICL_WARN("data digest check failed; got 0x%x, "
508		    "should be 0x%x", received_digest, valid_digest);
509		return (-1);
510	}
511
512	return (0);
513}
514
515/*
516 * Somewhat contrary to the name, this attempts to receive only one
517 * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
518 */
519static struct icl_pdu *
520icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
521{
522	struct icl_pdu *request;
523	struct socket *so;
524	size_t len;
525	int error;
526	bool more_needed;
527
528	so = ic->ic_socket;
529
530	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
531		KASSERT(ic->ic_receive_pdu == NULL,
532		    ("ic->ic_receive_pdu != NULL"));
533		request = icl_pdu_new(ic, M_NOWAIT);
534		if (request == NULL) {
535			ICL_DEBUG("failed to allocate PDU; "
536			    "dropping connection");
537			icl_conn_fail(ic);
538			return (NULL);
539		}
540		ic->ic_receive_pdu = request;
541	} else {
542		KASSERT(ic->ic_receive_pdu != NULL,
543		    ("ic->ic_receive_pdu == NULL"));
544		request = ic->ic_receive_pdu;
545	}
546
547	if (*availablep < ic->ic_receive_len) {
548#if 0
549		ICL_DEBUG("not enough data; need %zd, "
550		    "have %zd", ic->ic_receive_len, *availablep);
551#endif
552		return (NULL);
553	}
554
555	switch (ic->ic_receive_state) {
556	case ICL_CONN_STATE_BHS:
557		//ICL_DEBUG("receiving BHS");
558		error = icl_pdu_receive_bhs(request, availablep);
559		if (error != 0) {
560			ICL_DEBUG("failed to receive BHS; "
561			    "dropping connection");
562			break;
563		}
564
565		/*
566		 * We don't enforce any limit for AHS length;
567		 * its length is stored in 8 bit field.
568		 */
569
570		len = icl_pdu_data_segment_length(request);
571		if (len > ic->ic_max_data_segment_length) {
572			ICL_WARN("received data segment "
573			    "length %zd is larger than negotiated "
574			    "MaxDataSegmentLength %zd; "
575			    "dropping connection",
576			    len, ic->ic_max_data_segment_length);
577			error = EINVAL;
578			break;
579		}
580
581		ic->ic_receive_state = ICL_CONN_STATE_AHS;
582		ic->ic_receive_len = icl_pdu_ahs_length(request);
583		break;
584
585	case ICL_CONN_STATE_AHS:
586		//ICL_DEBUG("receiving AHS");
587		error = icl_pdu_receive_ahs(request, availablep);
588		if (error != 0) {
589			ICL_DEBUG("failed to receive AHS; "
590			    "dropping connection");
591			break;
592		}
593		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
594		if (ic->ic_header_crc32c == false)
595			ic->ic_receive_len = 0;
596		else
597			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
598		break;
599
600	case ICL_CONN_STATE_HEADER_DIGEST:
601		//ICL_DEBUG("receiving header digest");
602		error = icl_pdu_check_header_digest(request, availablep);
603		if (error != 0) {
604			ICL_DEBUG("header digest failed; "
605			    "dropping connection");
606			break;
607		}
608
609		ic->ic_receive_state = ICL_CONN_STATE_DATA;
610		ic->ic_receive_len =
611		    icl_pdu_data_segment_receive_len(request);
612		break;
613
614	case ICL_CONN_STATE_DATA:
615		//ICL_DEBUG("receiving data segment");
616		error = icl_pdu_receive_data_segment(request, availablep,
617		    &more_needed);
618		if (error != 0) {
619			ICL_DEBUG("failed to receive data segment;"
620			    "dropping connection");
621			break;
622		}
623
624		if (more_needed)
625			break;
626
627		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
628		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
629			ic->ic_receive_len = 0;
630		else
631			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
632		break;
633
634	case ICL_CONN_STATE_DATA_DIGEST:
635		//ICL_DEBUG("receiving data digest");
636		error = icl_pdu_check_data_digest(request, availablep);
637		if (error != 0) {
638			ICL_DEBUG("data digest failed; "
639			    "dropping connection");
640			break;
641		}
642
643		/*
644		 * We've received complete PDU; reset the receive state machine
645		 * and return the PDU.
646		 */
647		ic->ic_receive_state = ICL_CONN_STATE_BHS;
648		ic->ic_receive_len = sizeof(struct iscsi_bhs);
649		ic->ic_receive_pdu = NULL;
650		return (request);
651
652	default:
653		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
654	}
655
656	if (error != 0) {
657		icl_pdu_free(request);
658		icl_conn_fail(ic);
659	}
660
661	return (NULL);
662}
663
664static void
665icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
666{
667	struct icl_pdu *response;
668	struct socket *so;
669
670	so = ic->ic_socket;
671
672	/*
673	 * This can never happen; we're careful to only mess with ic->ic_socket
674	 * pointer when the send/receive threads are not running.
675	 */
676	KASSERT(so != NULL, ("NULL socket"));
677
678	for (;;) {
679		if (ic->ic_disconnecting)
680			return;
681
682		if (so->so_error != 0) {
683			ICL_DEBUG("connection error %d; "
684			    "dropping connection", so->so_error);
685			icl_conn_fail(ic);
686			return;
687		}
688
689		/*
690		 * Loop until we have a complete PDU or there is not enough
691		 * data in the socket buffer.
692		 */
693		if (available < ic->ic_receive_len) {
694#if 0
695			ICL_DEBUG("not enough data; have %zd, "
696			    "need %zd", available,
697			    ic->ic_receive_len);
698#endif
699			return;
700		}
701
702		response = icl_conn_receive_pdu(ic, &available);
703		if (response == NULL)
704			continue;
705
706		if (response->ip_ahs_len > 0) {
707			ICL_WARN("received PDU with unsupported "
708			    "AHS; opcode 0x%x; dropping connection",
709			    response->ip_bhs->bhs_opcode);
710			icl_pdu_free(response);
711			icl_conn_fail(ic);
712			return;
713		}
714
715		(ic->ic_receive)(response);
716	}
717}
718
719static void
720icl_receive_thread(void *arg)
721{
722	struct icl_conn *ic;
723	size_t available;
724	struct socket *so;
725
726	ic = arg;
727	so = ic->ic_socket;
728
729	ICL_CONN_LOCK(ic);
730	ic->ic_receive_running = true;
731	ICL_CONN_UNLOCK(ic);
732
733	for (;;) {
734		if (ic->ic_disconnecting) {
735			//ICL_DEBUG("terminating");
736			break;
737		}
738
739		SOCKBUF_LOCK(&so->so_rcv);
740		available = so->so_rcv.sb_cc;
741		if (available < ic->ic_receive_len) {
742			so->so_rcv.sb_lowat = ic->ic_receive_len;
743			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
744		}
745		SOCKBUF_UNLOCK(&so->so_rcv);
746
747		icl_conn_receive_pdus(ic, available);
748	}
749
750	ICL_CONN_LOCK(ic);
751	ic->ic_receive_running = false;
752	ICL_CONN_UNLOCK(ic);
753	kthread_exit();
754}
755
756static int
757icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
758{
759	struct icl_conn *ic;
760
761	ic = arg;
762	cv_signal(&ic->ic_receive_cv);
763	return (SU_OK);
764}
765
766static int
767icl_pdu_send(struct icl_pdu *request)
768{
769	size_t padding, pdu_len;
770	uint32_t digest, zero = 0;
771	int error, ok;
772	struct socket *so;
773	struct icl_conn *ic;
774
775	ic = request->ip_conn;
776	so = request->ip_conn->ic_socket;
777
778	ICL_CONN_LOCK_ASSERT(ic);
779
780	icl_pdu_set_data_segment_length(request, request->ip_data_len);
781
782	pdu_len = icl_pdu_size(request);
783
784	if (ic->ic_header_crc32c) {
785		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
786		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
787		    (void *)&digest);
788		if (ok != 1) {
789			ICL_WARN("failed to append header digest");
790			return (1);
791		}
792	}
793
794	if (request->ip_data_len != 0) {
795		padding = icl_pdu_padding(request);
796		if (padding > 0) {
797			ok = m_append(request->ip_data_mbuf, padding,
798			    (void *)&zero);
799			if (ok != 1) {
800				ICL_WARN("failed to append padding");
801				return (1);
802			}
803		}
804
805		if (ic->ic_data_crc32c) {
806			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
807
808			ok = m_append(request->ip_data_mbuf, sizeof(digest),
809			    (void *)&digest);
810			if (ok != 1) {
811				ICL_WARN("failed to append header digest");
812				return (1);
813			}
814		}
815
816		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
817		request->ip_data_mbuf = NULL;
818	}
819
820	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
821
822	error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
823	    NULL, MSG_DONTWAIT, curthread);
824	request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
825	if (error != 0) {
826		ICL_DEBUG("sosend error %d", error);
827		return (error);
828	}
829
830	return (0);
831}
832
833static void
834icl_conn_send_pdus(struct icl_conn *ic)
835{
836	struct icl_pdu *request;
837	struct socket *so;
838	size_t available, size;
839	int error;
840
841	ICL_CONN_LOCK_ASSERT(ic);
842
843	so = ic->ic_socket;
844
845	SOCKBUF_LOCK(&so->so_snd);
846	available = sbspace(&so->so_snd);
847	SOCKBUF_UNLOCK(&so->so_snd);
848
849	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
850		if (ic->ic_disconnecting)
851			return;
852
853		request = TAILQ_FIRST(&ic->ic_to_send);
854		size = icl_pdu_size(request);
855		if (available < size) {
856			/*
857			 * Set the low watermark on the socket,
858			 * to avoid waking up until there is enough
859			 * space.
860			 */
861			SOCKBUF_LOCK(&so->so_snd);
862			so->so_snd.sb_lowat = size;
863			SOCKBUF_UNLOCK(&so->so_snd);
864#if 1
865			ICL_DEBUG("no space to send; "
866			    "have %zd, need %zd",
867			    available, size);
868#endif
869			return;
870		}
871		available -= size;
872		TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
873		error = icl_pdu_send(request);
874		if (error != 0) {
875			ICL_DEBUG("failed to send PDU; "
876			    "dropping connection");
877			icl_conn_fail(ic);
878			return;
879		}
880		icl_pdu_free(request);
881	}
882}
883
884static void
885icl_send_thread(void *arg)
886{
887	struct icl_conn *ic;
888
889	ic = arg;
890
891	ICL_CONN_LOCK(ic);
892	ic->ic_send_running = true;
893
894	for (;;) {
895		if (ic->ic_disconnecting) {
896			//ICL_DEBUG("terminating");
897			break;
898		}
899		icl_conn_send_pdus(ic);
900		cv_wait(&ic->ic_send_cv, ic->ic_lock);
901	}
902
903	ic->ic_send_running = false;
904	ICL_CONN_UNLOCK(ic);
905	kthread_exit();
906}
907
908static int
909icl_soupcall_send(struct socket *so, void *arg, int waitflag)
910{
911	struct icl_conn *ic;
912
913	ic = arg;
914	cv_signal(&ic->ic_send_cv);
915	return (SU_OK);
916}
917
918int
919icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
920{
921	struct mbuf *mb, *newmb;
922	size_t copylen, off = 0;
923
924	KASSERT(len > 0, ("len == 0"));
925
926	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
927	if (newmb == NULL) {
928		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
929		return (ENOMEM);
930	}
931
932	for (mb = newmb; mb != NULL; mb = mb->m_next) {
933		copylen = min(M_TRAILINGSPACE(mb), len - off);
934		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
935		mb->m_len = copylen;
936		off += copylen;
937	}
938	KASSERT(off == len, ("%s: off != len", __func__));
939
940	if (request->ip_data_mbuf == NULL) {
941		request->ip_data_mbuf = newmb;
942		request->ip_data_len = len;
943	} else {
944		m_cat(request->ip_data_mbuf, newmb);
945		request->ip_data_len += len;
946	}
947
948	return (0);
949}
950
/*
 * Copy len bytes, starting off bytes into the PDU's data segment mbuf
 * chain, into the buffer at addr.  The range is passed to m_copydata()
 * as is; no bounds checking is done here.
 */
void
icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
{

	m_copydata(ip->ip_data_mbuf, off, len, addr);
}
957
958void
959icl_pdu_queue(struct icl_pdu *ip)
960{
961	struct icl_conn *ic;
962
963	ic = ip->ip_conn;
964
965	ICL_CONN_LOCK_ASSERT(ic);
966
967	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
968		ICL_DEBUG("icl_pdu_queue on closed connection");
969		icl_pdu_free(ip);
970		return;
971	}
972	TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
973	cv_signal(&ic->ic_send_cv);
974}
975
976struct icl_conn *
977icl_conn_new(struct mtx *lock)
978{
979	struct icl_conn *ic;
980
981	refcount_acquire(&icl_ncons);
982
983	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
984
985	TAILQ_INIT(&ic->ic_to_send);
986	ic->ic_lock = lock;
987	cv_init(&ic->ic_send_cv, "icl_tx");
988	cv_init(&ic->ic_receive_cv, "icl_rx");
989#ifdef DIAGNOSTIC
990	refcount_init(&ic->ic_outstanding_pdus, 0);
991#endif
992	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
993
994	return (ic);
995}
996
/*
 * Destroy a connection created by icl_conn_new(): tear down both condition
 * variables, return the structure to icl_conn_zone and drop the global
 * connection count.
 */
void
icl_conn_free(struct icl_conn *ic)
{

	cv_destroy(&ic->ic_send_cv);
	cv_destroy(&ic->ic_receive_cv);
	uma_zfree(icl_conn_zone, ic);
	/* Released last; icl_unload() checks this count. */
	refcount_release(&icl_ncons);
}
1006
1007static int
1008icl_conn_start(struct icl_conn *ic)
1009{
1010	size_t bufsize;
1011	struct sockopt opt;
1012	int error, one = 1;
1013
1014	ICL_CONN_LOCK(ic);
1015
1016	/*
1017	 * XXX: Ugly hack.
1018	 */
1019	if (ic->ic_socket == NULL) {
1020		ICL_CONN_UNLOCK(ic);
1021		return (EINVAL);
1022	}
1023
1024	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1025	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1026	ic->ic_disconnecting = false;
1027
1028	ICL_CONN_UNLOCK(ic);
1029
1030	/*
1031	 * Use max available sockbuf size for sending.  Do it manually
1032	 * instead of sbreserve(9) to work around resource limits.
1033	 *
1034	 * XXX: This kind of sucks.  On one hand, we don't currently support
1035	 *	sending a part of data segment; we always do it in one piece,
1036	 *	so we have to make sure it can fit in the socket buffer.
1037	 *	Once I've implemented partial send, we'll get rid of this
1038	 *	and use autoscaling.
1039	 */
1040        bufsize = (sizeof(struct iscsi_bhs) +
1041            ic->ic_max_data_segment_length) * 8;
1042	error = soreserve(ic->ic_socket, bufsize, bufsize);
1043	if (error != 0) {
1044		ICL_WARN("soreserve failed with error %d", error);
1045		icl_conn_close(ic);
1046		return (error);
1047	}
1048
1049	/*
1050	 * Disable Nagle.
1051	 */
1052	bzero(&opt, sizeof(opt));
1053	opt.sopt_dir = SOPT_SET;
1054	opt.sopt_level = IPPROTO_TCP;
1055	opt.sopt_name = TCP_NODELAY;
1056	opt.sopt_val = &one;
1057	opt.sopt_valsize = sizeof(one);
1058	error = sosetopt(ic->ic_socket, &opt);
1059	if (error != 0) {
1060		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1061		icl_conn_close(ic);
1062		return (error);
1063	}
1064
1065	/*
1066	 * Start threads.
1067	 */
1068	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1069	if (error != 0) {
1070		ICL_WARN("kthread_add(9) failed with error %d", error);
1071		icl_conn_close(ic);
1072		return (error);
1073	}
1074
1075	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1076	if (error != 0) {
1077		ICL_WARN("kthread_add(9) failed with error %d", error);
1078		icl_conn_close(ic);
1079		return (error);
1080	}
1081
1082	/*
1083	 * Register socket upcall, to get notified about incoming PDUs
1084	 * and free space to send outgoing ones.
1085	 */
1086	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1087	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1088	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1089	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1090	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1091	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1092
1093	return (0);
1094}
1095
1096int
1097icl_conn_handoff(struct icl_conn *ic, int fd)
1098{
1099	struct file *fp;
1100	struct socket *so;
1101	cap_rights_t rights;
1102	int error;
1103
1104	ICL_CONN_LOCK_ASSERT_NOT(ic);
1105
1106	/*
1107	 * Steal the socket from userland.
1108	 */
1109	error = fget(curthread, fd,
1110	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1111	if (error != 0)
1112		return (error);
1113	if (fp->f_type != DTYPE_SOCKET) {
1114		fdrop(fp, curthread);
1115		return (EINVAL);
1116	}
1117	so = fp->f_data;
1118	if (so->so_type != SOCK_STREAM) {
1119		fdrop(fp, curthread);
1120		return (EINVAL);
1121	}
1122
1123	ICL_CONN_LOCK(ic);
1124
1125	if (ic->ic_socket != NULL) {
1126		ICL_CONN_UNLOCK(ic);
1127		fdrop(fp, curthread);
1128		return (EBUSY);
1129	}
1130
1131	ic->ic_socket = fp->f_data;
1132	fp->f_ops = &badfileops;
1133	fp->f_data = NULL;
1134	fdrop(fp, curthread);
1135	ICL_CONN_UNLOCK(ic);
1136
1137	error = icl_conn_start(ic);
1138
1139	return (error);
1140}
1141
1142void
1143icl_conn_shutdown(struct icl_conn *ic)
1144{
1145	ICL_CONN_LOCK_ASSERT_NOT(ic);
1146
1147	ICL_CONN_LOCK(ic);
1148	if (ic->ic_socket == NULL) {
1149		ICL_CONN_UNLOCK(ic);
1150		return;
1151	}
1152	ICL_CONN_UNLOCK(ic);
1153
1154	soshutdown(ic->ic_socket, SHUT_RDWR);
1155}
1156
/*
 * Close the connection: stop the send and receive threads, close the
 * socket and free the partially received PDU and everything still sitting
 * in the send queue.  Must be called without the connection lock held.
 * Does nothing if the connection has no socket.
 */
void
icl_conn_close(struct icl_conn *ic)
{
	struct icl_pdu *pdu;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket == NULL) {
		ICL_CONN_UNLOCK(ic);
		return;
	}

	/* Both threads check this flag on every loop iteration. */
	ic->ic_disconnecting = true;

	/*
	 * Wake up the threads, so they can properly terminate.
	 */
	cv_signal(&ic->ic_receive_cv);
	cv_signal(&ic->ic_send_cv);
	/*
	 * Poll, re-signalling about once a second, until both threads have
	 * cleared their running flags.  The lock is dropped while waiting
	 * because the threads take it to update those flags.
	 */
	while (ic->ic_receive_running || ic->ic_send_running) {
		//ICL_DEBUG("waiting for send/receive threads to terminate");
		ICL_CONN_UNLOCK(ic);
		cv_signal(&ic->ic_receive_cv);
		cv_signal(&ic->ic_send_cv);
		pause("icl_close", 1 * hz);
		ICL_CONN_LOCK(ic);
	}
	//ICL_DEBUG("send/receive threads terminated");

	/* NOTE(review): soclose() runs with the connection lock held - confirm this is safe. */
	soclose(ic->ic_socket);
	ic->ic_socket = NULL;

	if (ic->ic_receive_pdu != NULL) {
		//ICL_DEBUG("freeing partially received PDU");
		icl_pdu_free(ic->ic_receive_pdu);
		ic->ic_receive_pdu = NULL;
	}

	/*
	 * Remove any outstanding PDUs from the send queue.
	 */
	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
		pdu = TAILQ_FIRST(&ic->ic_to_send);
		TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
		icl_pdu_free(pdu);
	}

	KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
	    ("destroying session with non-empty send queue"));
	/*
	 * XXX
	 */
#if 0
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	     ic->ic_outstanding_pdus));
#endif
	ICL_CONN_UNLOCK(ic);
}
1217
1218bool
1219icl_conn_connected(struct icl_conn *ic)
1220{
1221	ICL_CONN_LOCK_ASSERT_NOT(ic);
1222
1223	ICL_CONN_LOCK(ic);
1224	if (ic->ic_socket == NULL) {
1225		ICL_CONN_UNLOCK(ic);
1226		return (false);
1227	}
1228	if (ic->ic_socket->so_error != 0) {
1229		ICL_CONN_UNLOCK(ic);
1230		return (false);
1231	}
1232	ICL_CONN_UNLOCK(ic);
1233	return (true);
1234}
1235
#ifdef ICL_KERNEL_PROXY
/*
 * Attach an in-kernel socket to the connection and start it.  Must be
 * called without the connection lock held.  Returns EINVAL if the socket
 * is not a stream socket, EBUSY if the connection already has a socket,
 * and otherwise the result of icl_conn_start().
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1260
1261static int
1262icl_unload(void)
1263{
1264
1265	if (icl_ncons != 0)
1266		return (EBUSY);
1267
1268	uma_zdestroy(icl_conn_zone);
1269	uma_zdestroy(icl_pdu_zone);
1270
1271	return (0);
1272}
1273
1274static void
1275icl_load(void)
1276{
1277
1278	icl_conn_zone = uma_zcreate("icl_conn",
1279	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1280	    UMA_ALIGN_PTR, 0);
1281	icl_pdu_zone = uma_zcreate("icl_pdu",
1282	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1283	    UMA_ALIGN_PTR, 0);
1284
1285	refcount_init(&icl_ncons, 0);
1286}
1287
1288static int
1289icl_modevent(module_t mod, int what, void *arg)
1290{
1291
1292	switch (what) {
1293	case MOD_LOAD:
1294		icl_load();
1295		return (0);
1296	case MOD_UNLOAD:
1297		return (icl_unload());
1298	default:
1299		return (EINVAL);
1300	}
1301}
1302
/*
 * Module glue: names the module "icl" and routes its events to
 * icl_modevent().
 */
moduledata_t icl_data = {
	"icl",
	icl_modevent,
	0
};

DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_VERSION(icl, 1);
1311