icl_soft.c revision 264023
1/*-
2 * Copyright (c) 2012 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: head/sys/dev/iscsi/icl.c 264023 2014-04-01 21:47:22Z trasz $
30 */
31
32/*
33 * iSCSI Common Layer.  It's used by both the initiator and target to send
34 * and receive iSCSI PDUs.
35 */
36
37#include <sys/param.h>
38#include <sys/capsicum.h>
39#include <sys/condvar.h>
40#include <sys/conf.h>
41#include <sys/file.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/mbuf.h>
46#include <sys/mutex.h>
47#include <sys/module.h>
48#include <sys/socket.h>
49#include <sys/socketvar.h>
50#include <sys/sysctl.h>
51#include <sys/systm.h>
52#include <sys/sx.h>
53#include <sys/uio.h>
54#include <vm/uma.h>
55#include <netinet/in.h>
56#include <netinet/tcp.h>
57
58#include "icl.h"
59#include "iscsi_proto.h"
60
61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62static int debug = 1;
63TUNABLE_INT("kern.icl.debug", &debug);
64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
65    &debug, 1, "Enable debug messages");
66static int partial_receive_len = 1 * 1024; /* XXX: More? */
67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
69    &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70    "data segment");
71
72static uma_zone_t icl_conn_zone;
73static uma_zone_t icl_pdu_zone;
74
75static volatile u_int	icl_ncons;
76
/*
 * Print a debug message, prefixed with the calling function's name and
 * terminated with a newline, when the "debug" knob is greater than 1.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1) {					\
			printf("%s: " X "\n",				\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)
82
/*
 * Print a warning, prefixed with "WARNING: " and the calling function's
 * name and terminated with a newline, when the "debug" knob is positive.
 */
#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0)						\
			printf("WARNING: %s: " X "\n", __func__,	\
			    ## __VA_ARGS__);				\
	} while (0)
90
/*
 * Wrappers around the connection mutex (ic->ic_lock).  The argument is
 * parenthesized so that any pointer-valued expression, not just a plain
 * identifier, can be passed safely.
 */
#define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
95
96static void
97icl_conn_fail(struct icl_conn *ic)
98{
99	if (ic->ic_socket == NULL)
100		return;
101
102	/*
103	 * XXX
104	 */
105	ic->ic_socket->so_error = EDOOFUS;
106	(ic->ic_error)(ic);
107}
108
109static struct mbuf *
110icl_conn_receive(struct icl_conn *ic, size_t len)
111{
112	struct uio uio;
113	struct socket *so;
114	struct mbuf *m;
115	int error, flags;
116
117	so = ic->ic_socket;
118
119	memset(&uio, 0, sizeof(uio));
120	uio.uio_resid = len;
121
122	flags = MSG_DONTWAIT;
123	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
124	if (error != 0) {
125		ICL_DEBUG("soreceive error %d", error);
126		return (NULL);
127	}
128	if (uio.uio_resid != 0) {
129		m_freem(m);
130		ICL_DEBUG("short read");
131		return (NULL);
132	}
133
134	return (m);
135}
136
137static struct icl_pdu *
138icl_pdu_new(struct icl_conn *ic, int flags)
139{
140	struct icl_pdu *ip;
141
142#ifdef DIAGNOSTIC
143	refcount_acquire(&ic->ic_outstanding_pdus);
144#endif
145	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
146	if (ip == NULL) {
147		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
148#ifdef DIAGNOSTIC
149		refcount_release(&ic->ic_outstanding_pdus);
150#endif
151		return (NULL);
152	}
153
154	ip->ip_conn = ic;
155
156	return (ip);
157}
158
159void
160icl_pdu_free(struct icl_pdu *ip)
161{
162	struct icl_conn *ic;
163
164	ic = ip->ip_conn;
165
166	m_freem(ip->ip_bhs_mbuf);
167	m_freem(ip->ip_ahs_mbuf);
168	m_freem(ip->ip_data_mbuf);
169	uma_zfree(icl_pdu_zone, ip);
170#ifdef DIAGNOSTIC
171	refcount_release(&ic->ic_outstanding_pdus);
172#endif
173}
174
175/*
176 * Allocate icl_pdu with empty BHS to fill up by the caller.
177 */
178struct icl_pdu *
179icl_pdu_new_bhs(struct icl_conn *ic, int flags)
180{
181	struct icl_pdu *ip;
182
183	ip = icl_pdu_new(ic, flags);
184	if (ip == NULL)
185		return (NULL);
186
187	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
188	    flags, MT_DATA, M_PKTHDR);
189	if (ip->ip_bhs_mbuf == NULL) {
190		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
191		icl_pdu_free(ip);
192		return (NULL);
193	}
194	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
195	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
196	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
197
198	return (ip);
199}
200
201static int
202icl_pdu_ahs_length(const struct icl_pdu *request)
203{
204
205	return (request->ip_bhs->bhs_total_ahs_len * 4);
206}
207
208size_t
209icl_pdu_data_segment_length(const struct icl_pdu *request)
210{
211	uint32_t len = 0;
212
213	len += request->ip_bhs->bhs_data_segment_len[0];
214	len <<= 8;
215	len += request->ip_bhs->bhs_data_segment_len[1];
216	len <<= 8;
217	len += request->ip_bhs->bhs_data_segment_len[2];
218
219	return (len);
220}
221
222static void
223icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
224{
225
226	response->ip_bhs->bhs_data_segment_len[2] = len;
227	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
228	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
229}
230
231static size_t
232icl_pdu_padding(const struct icl_pdu *ip)
233{
234
235	if ((ip->ip_data_len % 4) != 0)
236		return (4 - (ip->ip_data_len % 4));
237
238	return (0);
239}
240
241static size_t
242icl_pdu_size(const struct icl_pdu *response)
243{
244	size_t len;
245
246	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
247
248	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
249	    icl_pdu_padding(response);
250	if (response->ip_conn->ic_header_crc32c)
251		len += ISCSI_HEADER_DIGEST_SIZE;
252	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
253		len += ISCSI_DATA_DIGEST_SIZE;
254
255	return (len);
256}
257
258static int
259icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
260{
261	struct mbuf *m;
262
263	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
264	if (m == NULL) {
265		ICL_DEBUG("failed to receive BHS");
266		return (-1);
267	}
268
269	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
270	if (request->ip_bhs_mbuf == NULL) {
271		ICL_WARN("m_pullup failed");
272		return (-1);
273	}
274	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
275
276	/*
277	 * XXX: For architectures with strict alignment requirements
278	 * 	we may need to allocate ip_bhs and copy the data into it.
279	 * 	For some reason, though, not doing this doesn't seem
280	 * 	to cause problems; tested on sparc64.
281	 */
282
283	*availablep -= sizeof(struct iscsi_bhs);
284	return (0);
285}
286
287static int
288icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
289{
290
291	request->ip_ahs_len = icl_pdu_ahs_length(request);
292	if (request->ip_ahs_len == 0)
293		return (0);
294
295	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
296	    request->ip_ahs_len);
297	if (request->ip_ahs_mbuf == NULL) {
298		ICL_DEBUG("failed to receive AHS");
299		return (-1);
300	}
301
302	*availablep -= request->ip_ahs_len;
303	return (0);
304}
305
306static uint32_t
307icl_mbuf_to_crc32c(const struct mbuf *m0)
308{
309	uint32_t digest = 0xffffffff;
310	const struct mbuf *m;
311
312	for (m = m0; m != NULL; m = m->m_next)
313		digest = calculate_crc32c(digest,
314		    mtod(m, const void *), m->m_len);
315
316	digest = digest ^ 0xffffffff;
317
318	return (digest);
319}
320
321static int
322icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
323{
324	struct mbuf *m;
325	uint32_t received_digest, valid_digest;
326
327	if (request->ip_conn->ic_header_crc32c == false)
328		return (0);
329
330	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
331	if (m == NULL) {
332		ICL_DEBUG("failed to receive header digest");
333		return (-1);
334	}
335
336	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
337	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
338	m_freem(m);
339
340	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
341
342	/*
343	 * XXX: Handle AHS.
344	 */
345	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
346	if (received_digest != valid_digest) {
347		ICL_WARN("header digest check failed; got 0x%x, "
348		    "should be 0x%x", received_digest, valid_digest);
349		return (-1);
350	}
351
352	return (0);
353}
354
355/*
356 * Return the number of bytes that should be waiting in the receive socket
357 * before icl_pdu_receive_data_segment() gets called.
358 */
359static size_t
360icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
361{
362	size_t len;
363
364	len = icl_pdu_data_segment_length(request);
365	if (len == 0)
366		return (0);
367
368	/*
369	 * Account for the parts of data segment already read from
370	 * the socket buffer.
371	 */
372	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
373	len -= request->ip_data_len;
374
375	/*
376	 * Don't always wait for the full data segment to be delivered
377	 * to the socket; this might badly affect performance due to
378	 * TCP window scaling.
379	 */
380	if (len > partial_receive_len) {
381#if 0
382		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
383		    len, partial_receive_len));
384#endif
385		len = partial_receive_len;
386
387		return (len);
388	}
389
390	/*
391	 * Account for padding.  Note that due to the way code is written,
392	 * the icl_pdu_receive_data_segment() must always receive padding
393	 * along with the last part of data segment, because it would be
394	 * impossible to tell whether we've already received the full data
395	 * segment including padding, or without it.
396	 */
397	if ((len % 4) != 0)
398		len += 4 - (len % 4);
399
400#if 0
401	ICL_DEBUG("need %zd bytes of data", len));
402#endif
403
404	return (len);
405}
406
407static int
408icl_pdu_receive_data_segment(struct icl_pdu *request,
409    size_t *availablep, bool *more_neededp)
410{
411	struct icl_conn *ic;
412	size_t len, padding = 0;
413	struct mbuf *m;
414
415	ic = request->ip_conn;
416
417	*more_neededp = false;
418	ic->ic_receive_len = 0;
419
420	len = icl_pdu_data_segment_length(request);
421	if (len == 0)
422		return (0);
423
424	if ((len % 4) != 0)
425		padding = 4 - (len % 4);
426
427	/*
428	 * Account for already received parts of data segment.
429	 */
430	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
431	len -= request->ip_data_len;
432
433	if (len + padding > *availablep) {
434		/*
435		 * Not enough data in the socket buffer.  Receive as much
436		 * as we can.  Don't receive padding, since, obviously, it's
437		 * not the end of data segment yet.
438		 */
439#if 0
440		ICL_DEBUG("limited from %zd to %zd",
441		    len + padding, *availablep - padding));
442#endif
443		len = *availablep - padding;
444		*more_neededp = true;
445		padding = 0;
446	}
447
448	/*
449	 * Must not try to receive padding without at least one byte
450	 * of actual data segment.
451	 */
452	if (len > 0) {
453		m = icl_conn_receive(request->ip_conn, len + padding);
454		if (m == NULL) {
455			ICL_DEBUG("failed to receive data segment");
456			return (-1);
457		}
458
459		if (request->ip_data_mbuf == NULL)
460			request->ip_data_mbuf = m;
461		else
462			m_cat(request->ip_data_mbuf, m);
463
464		request->ip_data_len += len;
465		*availablep -= len + padding;
466	} else
467		ICL_DEBUG("len 0");
468
469	if (*more_neededp)
470		ic->ic_receive_len =
471		    icl_pdu_data_segment_receive_len(request);
472
473	return (0);
474}
475
476static int
477icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
478{
479	struct mbuf *m;
480	uint32_t received_digest, valid_digest;
481
482	if (request->ip_conn->ic_data_crc32c == false)
483		return (0);
484
485	if (request->ip_data_len == 0)
486		return (0);
487
488	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
489	if (m == NULL) {
490		ICL_DEBUG("failed to receive data digest");
491		return (-1);
492	}
493
494	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
495	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
496	m_freem(m);
497
498	*availablep -= ISCSI_DATA_DIGEST_SIZE;
499
500	/*
501	 * Note that ip_data_mbuf also contains padding; since digest
502	 * calculation is supposed to include that, we iterate over
503	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
504	 */
505	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
506	if (received_digest != valid_digest) {
507		ICL_WARN("data digest check failed; got 0x%x, "
508		    "should be 0x%x", received_digest, valid_digest);
509		return (-1);
510	}
511
512	return (0);
513}
514
515/*
516 * Somewhat contrary to the name, this attempts to receive only one
517 * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
518 */
519static struct icl_pdu *
520icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
521{
522	struct icl_pdu *request;
523	struct socket *so;
524	size_t len;
525	int error;
526	bool more_needed;
527
528	so = ic->ic_socket;
529
530	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
531		KASSERT(ic->ic_receive_pdu == NULL,
532		    ("ic->ic_receive_pdu != NULL"));
533		request = icl_pdu_new(ic, M_NOWAIT);
534		if (request == NULL) {
535			ICL_DEBUG("failed to allocate PDU; "
536			    "dropping connection");
537			icl_conn_fail(ic);
538			return (NULL);
539		}
540		ic->ic_receive_pdu = request;
541	} else {
542		KASSERT(ic->ic_receive_pdu != NULL,
543		    ("ic->ic_receive_pdu == NULL"));
544		request = ic->ic_receive_pdu;
545	}
546
547	if (*availablep < ic->ic_receive_len) {
548#if 0
549		ICL_DEBUG("not enough data; need %zd, "
550		    "have %zd", ic->ic_receive_len, *availablep);
551#endif
552		return (NULL);
553	}
554
555	switch (ic->ic_receive_state) {
556	case ICL_CONN_STATE_BHS:
557		//ICL_DEBUG("receiving BHS");
558		error = icl_pdu_receive_bhs(request, availablep);
559		if (error != 0) {
560			ICL_DEBUG("failed to receive BHS; "
561			    "dropping connection");
562			break;
563		}
564
565		/*
566		 * We don't enforce any limit for AHS length;
567		 * its length is stored in 8 bit field.
568		 */
569
570		len = icl_pdu_data_segment_length(request);
571		if (len > ic->ic_max_data_segment_length) {
572			ICL_WARN("received data segment "
573			    "length %zd is larger than negotiated "
574			    "MaxDataSegmentLength %zd; "
575			    "dropping connection",
576			    len, ic->ic_max_data_segment_length);
577			error = EINVAL;
578			break;
579		}
580
581		ic->ic_receive_state = ICL_CONN_STATE_AHS;
582		ic->ic_receive_len = icl_pdu_ahs_length(request);
583		break;
584
585	case ICL_CONN_STATE_AHS:
586		//ICL_DEBUG("receiving AHS");
587		error = icl_pdu_receive_ahs(request, availablep);
588		if (error != 0) {
589			ICL_DEBUG("failed to receive AHS; "
590			    "dropping connection");
591			break;
592		}
593		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
594		if (ic->ic_header_crc32c == false)
595			ic->ic_receive_len = 0;
596		else
597			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
598		break;
599
600	case ICL_CONN_STATE_HEADER_DIGEST:
601		//ICL_DEBUG("receiving header digest");
602		error = icl_pdu_check_header_digest(request, availablep);
603		if (error != 0) {
604			ICL_DEBUG("header digest failed; "
605			    "dropping connection");
606			break;
607		}
608
609		ic->ic_receive_state = ICL_CONN_STATE_DATA;
610		ic->ic_receive_len =
611		    icl_pdu_data_segment_receive_len(request);
612		break;
613
614	case ICL_CONN_STATE_DATA:
615		//ICL_DEBUG("receiving data segment");
616		error = icl_pdu_receive_data_segment(request, availablep,
617		    &more_needed);
618		if (error != 0) {
619			ICL_DEBUG("failed to receive data segment;"
620			    "dropping connection");
621			break;
622		}
623
624		if (more_needed)
625			break;
626
627		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
628		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
629			ic->ic_receive_len = 0;
630		else
631			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
632		break;
633
634	case ICL_CONN_STATE_DATA_DIGEST:
635		//ICL_DEBUG("receiving data digest");
636		error = icl_pdu_check_data_digest(request, availablep);
637		if (error != 0) {
638			ICL_DEBUG("data digest failed; "
639			    "dropping connection");
640			break;
641		}
642
643		/*
644		 * We've received complete PDU; reset the receive state machine
645		 * and return the PDU.
646		 */
647		ic->ic_receive_state = ICL_CONN_STATE_BHS;
648		ic->ic_receive_len = sizeof(struct iscsi_bhs);
649		ic->ic_receive_pdu = NULL;
650		return (request);
651
652	default:
653		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
654	}
655
656	if (error != 0) {
657		icl_pdu_free(request);
658		icl_conn_fail(ic);
659	}
660
661	return (NULL);
662}
663
664static void
665icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
666{
667	struct icl_pdu *response;
668	struct socket *so;
669
670	so = ic->ic_socket;
671
672	/*
673	 * This can never happen; we're careful to only mess with ic->ic_socket
674	 * pointer when the send/receive threads are not running.
675	 */
676	KASSERT(so != NULL, ("NULL socket"));
677
678	for (;;) {
679		if (ic->ic_disconnecting)
680			return;
681
682		if (so->so_error != 0) {
683			ICL_DEBUG("connection error %d; "
684			    "dropping connection", so->so_error);
685			icl_conn_fail(ic);
686			return;
687		}
688
689		/*
690		 * Loop until we have a complete PDU or there is not enough
691		 * data in the socket buffer.
692		 */
693		if (available < ic->ic_receive_len) {
694#if 0
695			ICL_DEBUG("not enough data; have %zd, "
696			    "need %zd", available,
697			    ic->ic_receive_len);
698#endif
699			return;
700		}
701
702		response = icl_conn_receive_pdu(ic, &available);
703		if (response == NULL)
704			continue;
705
706		if (response->ip_ahs_len > 0) {
707			ICL_WARN("received PDU with unsupported "
708			    "AHS; opcode 0x%x; dropping connection",
709			    response->ip_bhs->bhs_opcode);
710			icl_pdu_free(response);
711			icl_conn_fail(ic);
712			return;
713		}
714
715		(ic->ic_receive)(response);
716	}
717}
718
719static void
720icl_receive_thread(void *arg)
721{
722	struct icl_conn *ic;
723	size_t available;
724	struct socket *so;
725
726	ic = arg;
727	so = ic->ic_socket;
728
729	ICL_CONN_LOCK(ic);
730	ic->ic_receive_running = true;
731	ICL_CONN_UNLOCK(ic);
732
733	for (;;) {
734		if (ic->ic_disconnecting) {
735			//ICL_DEBUG("terminating");
736			break;
737		}
738
739		SOCKBUF_LOCK(&so->so_rcv);
740		available = so->so_rcv.sb_cc;
741		if (available < ic->ic_receive_len) {
742			so->so_rcv.sb_lowat = ic->ic_receive_len;
743			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
744		}
745		SOCKBUF_UNLOCK(&so->so_rcv);
746
747		icl_conn_receive_pdus(ic, available);
748	}
749
750	ICL_CONN_LOCK(ic);
751	ic->ic_receive_running = false;
752	ICL_CONN_UNLOCK(ic);
753	kthread_exit();
754}
755
756static int
757icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
758{
759	struct icl_conn *ic;
760
761	ic = arg;
762	cv_signal(&ic->ic_receive_cv);
763	return (SU_OK);
764}
765
766static int
767icl_pdu_send(struct icl_pdu *request)
768{
769	size_t padding, pdu_len;
770	uint32_t digest, zero = 0;
771	int error, ok;
772	struct socket *so;
773	struct icl_conn *ic;
774
775	ic = request->ip_conn;
776	so = request->ip_conn->ic_socket;
777
778	ICL_CONN_LOCK_ASSERT(ic);
779
780	icl_pdu_set_data_segment_length(request, request->ip_data_len);
781
782	pdu_len = icl_pdu_size(request);
783
784	if (ic->ic_header_crc32c) {
785		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
786		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
787		    (void *)&digest);
788		if (ok != 1) {
789			ICL_WARN("failed to append header digest");
790			return (1);
791		}
792	}
793
794	if (request->ip_data_len != 0) {
795		padding = icl_pdu_padding(request);
796		if (padding > 0) {
797			ok = m_append(request->ip_data_mbuf, padding,
798			    (void *)&zero);
799			if (ok != 1) {
800				ICL_WARN("failed to append padding");
801				return (1);
802			}
803		}
804
805		if (ic->ic_data_crc32c) {
806			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
807
808			ok = m_append(request->ip_data_mbuf, sizeof(digest),
809			    (void *)&digest);
810			if (ok != 1) {
811				ICL_WARN("failed to append header digest");
812				return (1);
813			}
814		}
815
816		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
817		request->ip_data_mbuf = NULL;
818	}
819
820	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
821
822	error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
823	    NULL, MSG_DONTWAIT, curthread);
824	request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
825	if (error != 0) {
826		ICL_DEBUG("sosend error %d", error);
827		return (error);
828	}
829
830	return (0);
831}
832
833static void
834icl_conn_send_pdus(struct icl_conn *ic)
835{
836	struct icl_pdu *request;
837	struct socket *so;
838	size_t available, size;
839	int error;
840
841	ICL_CONN_LOCK_ASSERT(ic);
842
843	so = ic->ic_socket;
844
845	SOCKBUF_LOCK(&so->so_snd);
846	available = sbspace(&so->so_snd);
847	SOCKBUF_UNLOCK(&so->so_snd);
848
849	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
850		if (ic->ic_disconnecting)
851			return;
852
853		request = TAILQ_FIRST(&ic->ic_to_send);
854		size = icl_pdu_size(request);
855		if (available < size) {
856			/*
857			 * Set the low watermark on the socket,
858			 * to avoid waking up until there is enough
859			 * space.
860			 */
861			SOCKBUF_LOCK(&so->so_snd);
862			so->so_snd.sb_lowat = size;
863			SOCKBUF_UNLOCK(&so->so_snd);
864#if 1
865			ICL_DEBUG("no space to send; "
866			    "have %zd, need %zd",
867			    available, size);
868#endif
869			return;
870		}
871		available -= size;
872		TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
873		error = icl_pdu_send(request);
874		if (error != 0) {
875			ICL_DEBUG("failed to send PDU; "
876			    "dropping connection");
877			icl_conn_fail(ic);
878			return;
879		}
880		icl_pdu_free(request);
881	}
882}
883
884static void
885icl_send_thread(void *arg)
886{
887	struct icl_conn *ic;
888
889	ic = arg;
890
891	ICL_CONN_LOCK(ic);
892	ic->ic_send_running = true;
893
894	for (;;) {
895		if (ic->ic_disconnecting) {
896			//ICL_DEBUG("terminating");
897			break;
898		}
899		icl_conn_send_pdus(ic);
900		cv_wait(&ic->ic_send_cv, ic->ic_lock);
901	}
902
903	ic->ic_send_running = false;
904	ICL_CONN_UNLOCK(ic);
905	kthread_exit();
906}
907
908static int
909icl_soupcall_send(struct socket *so, void *arg, int waitflag)
910{
911	struct icl_conn *ic;
912
913	ic = arg;
914	cv_signal(&ic->ic_send_cv);
915	return (SU_OK);
916}
917
918int
919icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
920{
921	struct mbuf *mb, *newmb;
922	size_t copylen, off = 0;
923
924	KASSERT(len > 0, ("len == 0"));
925
926	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
927	if (newmb == NULL) {
928		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
929		return (ENOMEM);
930	}
931
932	for (mb = newmb; mb != NULL; mb = mb->m_next) {
933		copylen = min(M_TRAILINGSPACE(mb), len - off);
934		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
935		mb->m_len = copylen;
936		off += copylen;
937	}
938	KASSERT(off == len, ("%s: off != len", __func__));
939
940	if (request->ip_data_mbuf == NULL) {
941		request->ip_data_mbuf = newmb;
942		request->ip_data_len = len;
943	} else {
944		m_cat(request->ip_data_mbuf, newmb);
945		request->ip_data_len += len;
946	}
947
948	return (0);
949}
950
951void
952icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
953{
954
955	m_copydata(ip->ip_data_mbuf, off, len, addr);
956}
957
958void
959icl_pdu_queue(struct icl_pdu *ip)
960{
961	struct icl_conn *ic;
962
963	ic = ip->ip_conn;
964
965	ICL_CONN_LOCK_ASSERT(ic);
966
967	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
968		ICL_DEBUG("icl_pdu_queue on closed connection");
969		icl_pdu_free(ip);
970		return;
971	}
972	TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
973	cv_signal(&ic->ic_send_cv);
974}
975
976struct icl_conn *
977icl_conn_new(const char *name, struct mtx *lock)
978{
979	struct icl_conn *ic;
980
981	refcount_acquire(&icl_ncons);
982
983	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
984
985	TAILQ_INIT(&ic->ic_to_send);
986	ic->ic_lock = lock;
987	cv_init(&ic->ic_send_cv, "icl_tx");
988	cv_init(&ic->ic_receive_cv, "icl_rx");
989#ifdef DIAGNOSTIC
990	refcount_init(&ic->ic_outstanding_pdus, 0);
991#endif
992	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
993	ic->ic_name = name;
994
995	return (ic);
996}
997
998void
999icl_conn_free(struct icl_conn *ic)
1000{
1001
1002	cv_destroy(&ic->ic_send_cv);
1003	cv_destroy(&ic->ic_receive_cv);
1004	uma_zfree(icl_conn_zone, ic);
1005	refcount_release(&icl_ncons);
1006}
1007
1008static int
1009icl_conn_start(struct icl_conn *ic)
1010{
1011	size_t bufsize;
1012	struct sockopt opt;
1013	int error, one = 1;
1014
1015	ICL_CONN_LOCK(ic);
1016
1017	/*
1018	 * XXX: Ugly hack.
1019	 */
1020	if (ic->ic_socket == NULL) {
1021		ICL_CONN_UNLOCK(ic);
1022		return (EINVAL);
1023	}
1024
1025	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1026	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1027	ic->ic_disconnecting = false;
1028
1029	ICL_CONN_UNLOCK(ic);
1030
1031	/*
1032	 * Use max available sockbuf size for sending.  Do it manually
1033	 * instead of sbreserve(9) to work around resource limits.
1034	 *
1035	 * XXX: This kind of sucks.  On one hand, we don't currently support
1036	 *	sending a part of data segment; we always do it in one piece,
1037	 *	so we have to make sure it can fit in the socket buffer.
1038	 *	Once I've implemented partial send, we'll get rid of this
1039	 *	and use autoscaling.
1040	 */
1041        bufsize = (sizeof(struct iscsi_bhs) +
1042            ic->ic_max_data_segment_length) * 8;
1043	error = soreserve(ic->ic_socket, bufsize, bufsize);
1044	if (error != 0) {
1045		ICL_WARN("soreserve failed with error %d", error);
1046		icl_conn_close(ic);
1047		return (error);
1048	}
1049
1050	/*
1051	 * Disable Nagle.
1052	 */
1053	bzero(&opt, sizeof(opt));
1054	opt.sopt_dir = SOPT_SET;
1055	opt.sopt_level = IPPROTO_TCP;
1056	opt.sopt_name = TCP_NODELAY;
1057	opt.sopt_val = &one;
1058	opt.sopt_valsize = sizeof(one);
1059	error = sosetopt(ic->ic_socket, &opt);
1060	if (error != 0) {
1061		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1062		icl_conn_close(ic);
1063		return (error);
1064	}
1065
1066	/*
1067	 * Start threads.
1068	 */
1069	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
1070	    ic->ic_name);
1071	if (error != 0) {
1072		ICL_WARN("kthread_add(9) failed with error %d", error);
1073		icl_conn_close(ic);
1074		return (error);
1075	}
1076
1077	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
1078	    ic->ic_name);
1079	if (error != 0) {
1080		ICL_WARN("kthread_add(9) failed with error %d", error);
1081		icl_conn_close(ic);
1082		return (error);
1083	}
1084
1085	/*
1086	 * Register socket upcall, to get notified about incoming PDUs
1087	 * and free space to send outgoing ones.
1088	 */
1089	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1090	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1091	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1092	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1093	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1094	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1095
1096	return (0);
1097}
1098
1099int
1100icl_conn_handoff(struct icl_conn *ic, int fd)
1101{
1102	struct file *fp;
1103	struct socket *so;
1104	cap_rights_t rights;
1105	int error;
1106
1107	ICL_CONN_LOCK_ASSERT_NOT(ic);
1108
1109	/*
1110	 * Steal the socket from userland.
1111	 */
1112	error = fget(curthread, fd,
1113	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1114	if (error != 0)
1115		return (error);
1116	if (fp->f_type != DTYPE_SOCKET) {
1117		fdrop(fp, curthread);
1118		return (EINVAL);
1119	}
1120	so = fp->f_data;
1121	if (so->so_type != SOCK_STREAM) {
1122		fdrop(fp, curthread);
1123		return (EINVAL);
1124	}
1125
1126	ICL_CONN_LOCK(ic);
1127
1128	if (ic->ic_socket != NULL) {
1129		ICL_CONN_UNLOCK(ic);
1130		fdrop(fp, curthread);
1131		return (EBUSY);
1132	}
1133
1134	ic->ic_socket = fp->f_data;
1135	fp->f_ops = &badfileops;
1136	fp->f_data = NULL;
1137	fdrop(fp, curthread);
1138	ICL_CONN_UNLOCK(ic);
1139
1140	error = icl_conn_start(ic);
1141
1142	return (error);
1143}
1144
1145void
1146icl_conn_shutdown(struct icl_conn *ic)
1147{
1148	ICL_CONN_LOCK_ASSERT_NOT(ic);
1149
1150	ICL_CONN_LOCK(ic);
1151	if (ic->ic_socket == NULL) {
1152		ICL_CONN_UNLOCK(ic);
1153		return;
1154	}
1155	ICL_CONN_UNLOCK(ic);
1156
1157	soshutdown(ic->ic_socket, SHUT_RDWR);
1158}
1159
1160void
1161icl_conn_close(struct icl_conn *ic)
1162{
1163	struct icl_pdu *pdu;
1164
1165	ICL_CONN_LOCK_ASSERT_NOT(ic);
1166
1167	ICL_CONN_LOCK(ic);
1168	if (ic->ic_socket == NULL) {
1169		ICL_CONN_UNLOCK(ic);
1170		return;
1171	}
1172
1173	ic->ic_disconnecting = true;
1174
1175	/*
1176	 * Wake up the threads, so they can properly terminate.
1177	 */
1178	cv_signal(&ic->ic_receive_cv);
1179	cv_signal(&ic->ic_send_cv);
1180	while (ic->ic_receive_running || ic->ic_send_running) {
1181		//ICL_DEBUG("waiting for send/receive threads to terminate");
1182		ICL_CONN_UNLOCK(ic);
1183		cv_signal(&ic->ic_receive_cv);
1184		cv_signal(&ic->ic_send_cv);
1185		pause("icl_close", 1 * hz);
1186		ICL_CONN_LOCK(ic);
1187	}
1188	//ICL_DEBUG("send/receive threads terminated");
1189
1190	soclose(ic->ic_socket);
1191	ic->ic_socket = NULL;
1192
1193	if (ic->ic_receive_pdu != NULL) {
1194		//ICL_DEBUG("freeing partially received PDU");
1195		icl_pdu_free(ic->ic_receive_pdu);
1196		ic->ic_receive_pdu = NULL;
1197	}
1198
1199	/*
1200	 * Remove any outstanding PDUs from the send queue.
1201	 */
1202	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1203		pdu = TAILQ_FIRST(&ic->ic_to_send);
1204		TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1205		icl_pdu_free(pdu);
1206	}
1207
1208	KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1209	    ("destroying session with non-empty send queue"));
1210	/*
1211	 * XXX
1212	 */
1213#if 0
1214	KASSERT(ic->ic_outstanding_pdus == 0,
1215	    ("destroying session with %d outstanding PDUs",
1216	     ic->ic_outstanding_pdus));
1217#endif
1218	ICL_CONN_UNLOCK(ic);
1219}
1220
1221bool
1222icl_conn_connected(struct icl_conn *ic)
1223{
1224	ICL_CONN_LOCK_ASSERT_NOT(ic);
1225
1226	ICL_CONN_LOCK(ic);
1227	if (ic->ic_socket == NULL) {
1228		ICL_CONN_UNLOCK(ic);
1229		return (false);
1230	}
1231	if (ic->ic_socket->so_error != 0) {
1232		ICL_CONN_UNLOCK(ic);
1233		return (false);
1234	}
1235	ICL_CONN_UNLOCK(ic);
1236	return (true);
1237}
1238
#ifdef ICL_KERNEL_PROXY
/*
 * Attach an in-kernel stream socket to the connection and start it.
 * Must be called without the connection lock held.  Returns EINVAL for
 * non-stream sockets, EBUSY if the connection already has a socket, or
 * whatever icl_conn_start() returns.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1263
1264static int
1265icl_unload(void)
1266{
1267
1268	if (icl_ncons != 0)
1269		return (EBUSY);
1270
1271	uma_zdestroy(icl_conn_zone);
1272	uma_zdestroy(icl_pdu_zone);
1273
1274	return (0);
1275}
1276
1277static void
1278icl_load(void)
1279{
1280
1281	icl_conn_zone = uma_zcreate("icl_conn",
1282	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1283	    UMA_ALIGN_PTR, 0);
1284	icl_pdu_zone = uma_zcreate("icl_pdu",
1285	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1286	    UMA_ALIGN_PTR, 0);
1287
1288	refcount_init(&icl_ncons, 0);
1289}
1290
1291static int
1292icl_modevent(module_t mod, int what, void *arg)
1293{
1294
1295	switch (what) {
1296	case MOD_LOAD:
1297		icl_load();
1298		return (0);
1299	case MOD_UNLOAD:
1300		return (icl_unload());
1301	default:
1302		return (EINVAL);
1303	}
1304}
1305
1306moduledata_t icl_data = {
1307	"icl",
1308	icl_modevent,
1309	0
1310};
1311
1312DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1313MODULE_VERSION(icl, 1);
1314