icl_soft.c revision 263743
1/*-
2 * Copyright (c) 2012 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: head/sys/dev/iscsi/icl.c 263743 2014-03-25 19:17:22Z trasz $
30 */
31
32/*
33 * iSCSI Common Layer.  It's used by both the initiator and target to send
34 * and receive iSCSI PDUs.
35 */
36
37#include <sys/param.h>
38#include <sys/capsicum.h>
39#include <sys/condvar.h>
40#include <sys/conf.h>
41#include <sys/file.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/mbuf.h>
46#include <sys/mutex.h>
47#include <sys/module.h>
48#include <sys/socket.h>
49#include <sys/socketvar.h>
50#include <sys/sysctl.h>
51#include <sys/systm.h>
52#include <sys/sx.h>
53#include <sys/uio.h>
54#include <vm/uma.h>
55#include <netinet/in.h>
56#include <netinet/tcp.h>
57
58#include "icl.h"
59#include "iscsi_proto.h"
60
61SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62static int debug = 1;
63TUNABLE_INT("kern.icl.debug", &debug);
64SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
65    &debug, 1, "Enable debug messages");
66static int partial_receive_len = 1 * 1024; /* XXX: More? */
67TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
69    &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70    "data segment");
71
72static uma_zone_t icl_conn_zone;
73static uma_zone_t icl_pdu_zone;
74
75static volatile u_int	icl_ncons;
76
/*
 * Verbose diagnostics: printed only when the "debug" knob is above 1.
 * The message is prefixed with the name of the calling function.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1)						\
			printf("%s: " X "\n", __func__, ## __VA_ARGS__);\
	} while (0)

/*
 * Warnings: printed whenever the "debug" knob is above 0.
 */
#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

/*
 * Connection lock helpers.  The argument is parenthesized so that any
 * pointer-valued expression (e.g. "base + 1") expands correctly; without
 * the parentheses "->" would bind to the last operand only.
 */
#define	ICL_CONN_LOCK(X)		mtx_lock(&(X)->ic_lock)
#define	ICL_CONN_UNLOCK(X)		mtx_unlock(&(X)->ic_lock)
#define	ICL_CONN_LOCK_ASSERT(X)		mtx_assert(&(X)->ic_lock, MA_OWNED)
94
95static void
96icl_conn_fail(struct icl_conn *ic)
97{
98	if (ic->ic_socket == NULL)
99		return;
100
101	/*
102	 * XXX
103	 */
104	ic->ic_socket->so_error = EDOOFUS;
105	(ic->ic_error)(ic);
106}
107
108static struct mbuf *
109icl_conn_receive(struct icl_conn *ic, size_t len)
110{
111	struct uio uio;
112	struct socket *so;
113	struct mbuf *m;
114	int error, flags;
115
116	so = ic->ic_socket;
117
118	memset(&uio, 0, sizeof(uio));
119	uio.uio_resid = len;
120
121	flags = MSG_DONTWAIT;
122	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
123	if (error != 0) {
124		ICL_DEBUG("soreceive error %d", error);
125		return (NULL);
126	}
127	if (uio.uio_resid != 0) {
128		m_freem(m);
129		ICL_DEBUG("short read");
130		return (NULL);
131	}
132
133	return (m);
134}
135
136static struct icl_pdu *
137icl_pdu_new(struct icl_conn *ic, int flags)
138{
139	struct icl_pdu *ip;
140
141#ifdef DIAGNOSTIC
142	refcount_acquire(&ic->ic_outstanding_pdus);
143#endif
144	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
145	if (ip == NULL) {
146		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
147#ifdef DIAGNOSTIC
148		refcount_release(&ic->ic_outstanding_pdus);
149#endif
150		return (NULL);
151	}
152
153	ip->ip_conn = ic;
154
155	return (ip);
156}
157
158void
159icl_pdu_free(struct icl_pdu *ip)
160{
161	struct icl_conn *ic;
162
163	ic = ip->ip_conn;
164
165	m_freem(ip->ip_bhs_mbuf);
166	m_freem(ip->ip_ahs_mbuf);
167	m_freem(ip->ip_data_mbuf);
168	uma_zfree(icl_pdu_zone, ip);
169#ifdef DIAGNOSTIC
170	refcount_release(&ic->ic_outstanding_pdus);
171#endif
172}
173
174/*
175 * Allocate icl_pdu with empty BHS to fill up by the caller.
176 */
177struct icl_pdu *
178icl_pdu_new_bhs(struct icl_conn *ic, int flags)
179{
180	struct icl_pdu *ip;
181
182	ip = icl_pdu_new(ic, flags);
183	if (ip == NULL)
184		return (NULL);
185
186	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
187	    flags, MT_DATA, M_PKTHDR);
188	if (ip->ip_bhs_mbuf == NULL) {
189		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
190		icl_pdu_free(ip);
191		return (NULL);
192	}
193	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
194	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
195	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
196
197	return (ip);
198}
199
200static int
201icl_pdu_ahs_length(const struct icl_pdu *request)
202{
203
204	return (request->ip_bhs->bhs_total_ahs_len * 4);
205}
206
207size_t
208icl_pdu_data_segment_length(const struct icl_pdu *request)
209{
210	uint32_t len = 0;
211
212	len += request->ip_bhs->bhs_data_segment_len[0];
213	len <<= 8;
214	len += request->ip_bhs->bhs_data_segment_len[1];
215	len <<= 8;
216	len += request->ip_bhs->bhs_data_segment_len[2];
217
218	return (len);
219}
220
221static void
222icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
223{
224
225	response->ip_bhs->bhs_data_segment_len[2] = len;
226	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
227	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
228}
229
230static size_t
231icl_pdu_padding(const struct icl_pdu *ip)
232{
233
234	if ((ip->ip_data_len % 4) != 0)
235		return (4 - (ip->ip_data_len % 4));
236
237	return (0);
238}
239
240static size_t
241icl_pdu_size(const struct icl_pdu *response)
242{
243	size_t len;
244
245	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
246
247	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
248	    icl_pdu_padding(response);
249	if (response->ip_conn->ic_header_crc32c)
250		len += ISCSI_HEADER_DIGEST_SIZE;
251	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
252		len += ISCSI_DATA_DIGEST_SIZE;
253
254	return (len);
255}
256
257static int
258icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
259{
260	struct mbuf *m;
261
262	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
263	if (m == NULL) {
264		ICL_DEBUG("failed to receive BHS");
265		return (-1);
266	}
267
268	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
269	if (request->ip_bhs_mbuf == NULL) {
270		ICL_WARN("m_pullup failed");
271		return (-1);
272	}
273	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
274
275	/*
276	 * XXX: For architectures with strict alignment requirements
277	 * 	we may need to allocate ip_bhs and copy the data into it.
278	 * 	For some reason, though, not doing this doesn't seem
279	 * 	to cause problems; tested on sparc64.
280	 */
281
282	*availablep -= sizeof(struct iscsi_bhs);
283	return (0);
284}
285
286static int
287icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
288{
289
290	request->ip_ahs_len = icl_pdu_ahs_length(request);
291	if (request->ip_ahs_len == 0)
292		return (0);
293
294	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
295	    request->ip_ahs_len);
296	if (request->ip_ahs_mbuf == NULL) {
297		ICL_DEBUG("failed to receive AHS");
298		return (-1);
299	}
300
301	*availablep -= request->ip_ahs_len;
302	return (0);
303}
304
305static uint32_t
306icl_mbuf_to_crc32c(const struct mbuf *m0)
307{
308	uint32_t digest = 0xffffffff;
309	const struct mbuf *m;
310
311	for (m = m0; m != NULL; m = m->m_next)
312		digest = calculate_crc32c(digest,
313		    mtod(m, const void *), m->m_len);
314
315	digest = digest ^ 0xffffffff;
316
317	return (digest);
318}
319
320static int
321icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
322{
323	struct mbuf *m;
324	uint32_t received_digest, valid_digest;
325
326	if (request->ip_conn->ic_header_crc32c == false)
327		return (0);
328
329	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
330	if (m == NULL) {
331		ICL_DEBUG("failed to receive header digest");
332		return (-1);
333	}
334
335	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
336	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
337	m_freem(m);
338
339	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
340
341	/*
342	 * XXX: Handle AHS.
343	 */
344	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
345	if (received_digest != valid_digest) {
346		ICL_WARN("header digest check failed; got 0x%x, "
347		    "should be 0x%x", received_digest, valid_digest);
348		return (-1);
349	}
350
351	return (0);
352}
353
354/*
355 * Return the number of bytes that should be waiting in the receive socket
356 * before icl_pdu_receive_data_segment() gets called.
357 */
358static size_t
359icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
360{
361	size_t len;
362
363	len = icl_pdu_data_segment_length(request);
364	if (len == 0)
365		return (0);
366
367	/*
368	 * Account for the parts of data segment already read from
369	 * the socket buffer.
370	 */
371	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
372	len -= request->ip_data_len;
373
374	/*
375	 * Don't always wait for the full data segment to be delivered
376	 * to the socket; this might badly affect performance due to
377	 * TCP window scaling.
378	 */
379	if (len > partial_receive_len) {
380#if 0
381		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
382		    len, partial_receive_len));
383#endif
384		len = partial_receive_len;
385
386		return (len);
387	}
388
389	/*
390	 * Account for padding.  Note that due to the way code is written,
391	 * the icl_pdu_receive_data_segment() must always receive padding
392	 * along with the last part of data segment, because it would be
393	 * impossible to tell whether we've already received the full data
394	 * segment including padding, or without it.
395	 */
396	if ((len % 4) != 0)
397		len += 4 - (len % 4);
398
399#if 0
400	ICL_DEBUG("need %zd bytes of data", len));
401#endif
402
403	return (len);
404}
405
406static int
407icl_pdu_receive_data_segment(struct icl_pdu *request,
408    size_t *availablep, bool *more_neededp)
409{
410	struct icl_conn *ic;
411	size_t len, padding = 0;
412	struct mbuf *m;
413
414	ic = request->ip_conn;
415
416	*more_neededp = false;
417	ic->ic_receive_len = 0;
418
419	len = icl_pdu_data_segment_length(request);
420	if (len == 0)
421		return (0);
422
423	if ((len % 4) != 0)
424		padding = 4 - (len % 4);
425
426	/*
427	 * Account for already received parts of data segment.
428	 */
429	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
430	len -= request->ip_data_len;
431
432	if (len + padding > *availablep) {
433		/*
434		 * Not enough data in the socket buffer.  Receive as much
435		 * as we can.  Don't receive padding, since, obviously, it's
436		 * not the end of data segment yet.
437		 */
438#if 0
439		ICL_DEBUG("limited from %zd to %zd",
440		    len + padding, *availablep - padding));
441#endif
442		len = *availablep - padding;
443		*more_neededp = true;
444		padding = 0;
445	}
446
447	/*
448	 * Must not try to receive padding without at least one byte
449	 * of actual data segment.
450	 */
451	if (len > 0) {
452		m = icl_conn_receive(request->ip_conn, len + padding);
453		if (m == NULL) {
454			ICL_DEBUG("failed to receive data segment");
455			return (-1);
456		}
457
458		if (request->ip_data_mbuf == NULL)
459			request->ip_data_mbuf = m;
460		else
461			m_cat(request->ip_data_mbuf, m);
462
463		request->ip_data_len += len;
464		*availablep -= len + padding;
465	} else
466		ICL_DEBUG("len 0");
467
468	if (*more_neededp)
469		ic->ic_receive_len =
470		    icl_pdu_data_segment_receive_len(request);
471
472	return (0);
473}
474
475static int
476icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
477{
478	struct mbuf *m;
479	uint32_t received_digest, valid_digest;
480
481	if (request->ip_conn->ic_data_crc32c == false)
482		return (0);
483
484	if (request->ip_data_len == 0)
485		return (0);
486
487	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
488	if (m == NULL) {
489		ICL_DEBUG("failed to receive data digest");
490		return (-1);
491	}
492
493	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
494	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
495	m_freem(m);
496
497	*availablep -= ISCSI_DATA_DIGEST_SIZE;
498
499	/*
500	 * Note that ip_data_mbuf also contains padding; since digest
501	 * calculation is supposed to include that, we iterate over
502	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
503	 */
504	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
505	if (received_digest != valid_digest) {
506		ICL_WARN("data digest check failed; got 0x%x, "
507		    "should be 0x%x", received_digest, valid_digest);
508		return (-1);
509	}
510
511	return (0);
512}
513
514/*
515 * Somewhat contrary to the name, this attempts to receive only one
516 * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
517 */
518static struct icl_pdu *
519icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
520{
521	struct icl_pdu *request;
522	struct socket *so;
523	size_t len;
524	int error;
525	bool more_needed;
526
527	so = ic->ic_socket;
528
529	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
530		KASSERT(ic->ic_receive_pdu == NULL,
531		    ("ic->ic_receive_pdu != NULL"));
532		request = icl_pdu_new(ic, M_NOWAIT);
533		if (request == NULL) {
534			ICL_DEBUG("failed to allocate PDU; "
535			    "dropping connection");
536			icl_conn_fail(ic);
537			return (NULL);
538		}
539		ic->ic_receive_pdu = request;
540	} else {
541		KASSERT(ic->ic_receive_pdu != NULL,
542		    ("ic->ic_receive_pdu == NULL"));
543		request = ic->ic_receive_pdu;
544	}
545
546	if (*availablep < ic->ic_receive_len) {
547#if 0
548		ICL_DEBUG("not enough data; need %zd, "
549		    "have %zd", ic->ic_receive_len, *availablep);
550#endif
551		return (NULL);
552	}
553
554	switch (ic->ic_receive_state) {
555	case ICL_CONN_STATE_BHS:
556		//ICL_DEBUG("receiving BHS");
557		error = icl_pdu_receive_bhs(request, availablep);
558		if (error != 0) {
559			ICL_DEBUG("failed to receive BHS; "
560			    "dropping connection");
561			break;
562		}
563
564		/*
565		 * We don't enforce any limit for AHS length;
566		 * its length is stored in 8 bit field.
567		 */
568
569		len = icl_pdu_data_segment_length(request);
570		if (len > ic->ic_max_data_segment_length) {
571			ICL_WARN("received data segment "
572			    "length %zd is larger than negotiated "
573			    "MaxDataSegmentLength %zd; "
574			    "dropping connection",
575			    len, ic->ic_max_data_segment_length);
576			error = EINVAL;
577			break;
578		}
579
580		ic->ic_receive_state = ICL_CONN_STATE_AHS;
581		ic->ic_receive_len = icl_pdu_ahs_length(request);
582		break;
583
584	case ICL_CONN_STATE_AHS:
585		//ICL_DEBUG("receiving AHS");
586		error = icl_pdu_receive_ahs(request, availablep);
587		if (error != 0) {
588			ICL_DEBUG("failed to receive AHS; "
589			    "dropping connection");
590			break;
591		}
592		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
593		if (ic->ic_header_crc32c == false)
594			ic->ic_receive_len = 0;
595		else
596			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
597		break;
598
599	case ICL_CONN_STATE_HEADER_DIGEST:
600		//ICL_DEBUG("receiving header digest");
601		error = icl_pdu_check_header_digest(request, availablep);
602		if (error != 0) {
603			ICL_DEBUG("header digest failed; "
604			    "dropping connection");
605			break;
606		}
607
608		ic->ic_receive_state = ICL_CONN_STATE_DATA;
609		ic->ic_receive_len =
610		    icl_pdu_data_segment_receive_len(request);
611		break;
612
613	case ICL_CONN_STATE_DATA:
614		//ICL_DEBUG("receiving data segment");
615		error = icl_pdu_receive_data_segment(request, availablep,
616		    &more_needed);
617		if (error != 0) {
618			ICL_DEBUG("failed to receive data segment;"
619			    "dropping connection");
620			break;
621		}
622
623		if (more_needed)
624			break;
625
626		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
627		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
628			ic->ic_receive_len = 0;
629		else
630			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
631		break;
632
633	case ICL_CONN_STATE_DATA_DIGEST:
634		//ICL_DEBUG("receiving data digest");
635		error = icl_pdu_check_data_digest(request, availablep);
636		if (error != 0) {
637			ICL_DEBUG("data digest failed; "
638			    "dropping connection");
639			break;
640		}
641
642		/*
643		 * We've received complete PDU; reset the receive state machine
644		 * and return the PDU.
645		 */
646		ic->ic_receive_state = ICL_CONN_STATE_BHS;
647		ic->ic_receive_len = sizeof(struct iscsi_bhs);
648		ic->ic_receive_pdu = NULL;
649		return (request);
650
651	default:
652		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
653	}
654
655	if (error != 0) {
656		icl_pdu_free(request);
657		icl_conn_fail(ic);
658	}
659
660	return (NULL);
661}
662
663static void
664icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
665{
666	struct icl_pdu *response;
667	struct socket *so;
668
669	so = ic->ic_socket;
670
671	/*
672	 * This can never happen; we're careful to only mess with ic->ic_socket
673	 * pointer when the send/receive threads are not running.
674	 */
675	KASSERT(so != NULL, ("NULL socket"));
676
677	for (;;) {
678		if (ic->ic_disconnecting)
679			return;
680
681		if (so->so_error != 0) {
682			ICL_DEBUG("connection error %d; "
683			    "dropping connection", so->so_error);
684			icl_conn_fail(ic);
685			return;
686		}
687
688		/*
689		 * Loop until we have a complete PDU or there is not enough
690		 * data in the socket buffer.
691		 */
692		if (available < ic->ic_receive_len) {
693#if 0
694			ICL_DEBUG("not enough data; have %zd, "
695			    "need %zd", available,
696			    ic->ic_receive_len);
697#endif
698			return;
699		}
700
701		response = icl_conn_receive_pdu(ic, &available);
702		if (response == NULL)
703			continue;
704
705		if (response->ip_ahs_len > 0) {
706			ICL_WARN("received PDU with unsupported "
707			    "AHS; opcode 0x%x; dropping connection",
708			    response->ip_bhs->bhs_opcode);
709			icl_pdu_free(response);
710			icl_conn_fail(ic);
711			return;
712		}
713
714		(ic->ic_receive)(response);
715	}
716}
717
718static void
719icl_receive_thread(void *arg)
720{
721	struct icl_conn *ic;
722	size_t available;
723	struct socket *so;
724
725	ic = arg;
726	so = ic->ic_socket;
727
728	ICL_CONN_LOCK(ic);
729	ic->ic_receive_running = true;
730	ICL_CONN_UNLOCK(ic);
731
732	for (;;) {
733		if (ic->ic_disconnecting) {
734			//ICL_DEBUG("terminating");
735			break;
736		}
737
738		SOCKBUF_LOCK(&so->so_rcv);
739		available = so->so_rcv.sb_cc;
740		if (available < ic->ic_receive_len) {
741			so->so_rcv.sb_lowat = ic->ic_receive_len;
742			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
743		}
744		SOCKBUF_UNLOCK(&so->so_rcv);
745
746		icl_conn_receive_pdus(ic, available);
747	}
748
749	ICL_CONN_LOCK(ic);
750	ic->ic_receive_running = false;
751	ICL_CONN_UNLOCK(ic);
752	kthread_exit();
753}
754
755static int
756icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
757{
758	struct icl_conn *ic;
759
760	ic = arg;
761	cv_signal(&ic->ic_receive_cv);
762	return (SU_OK);
763}
764
765static int
766icl_pdu_send(struct icl_pdu *request)
767{
768	size_t padding, pdu_len;
769	uint32_t digest, zero = 0;
770	int error, ok;
771	struct socket *so;
772	struct icl_conn *ic;
773
774	ic = request->ip_conn;
775	so = request->ip_conn->ic_socket;
776
777	ICL_CONN_LOCK_ASSERT(ic);
778
779	icl_pdu_set_data_segment_length(request, request->ip_data_len);
780
781	pdu_len = icl_pdu_size(request);
782
783	if (ic->ic_header_crc32c) {
784		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
785		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
786		    (void *)&digest);
787		if (ok != 1) {
788			ICL_WARN("failed to append header digest");
789			return (1);
790		}
791	}
792
793	if (request->ip_data_len != 0) {
794		padding = icl_pdu_padding(request);
795		if (padding > 0) {
796			ok = m_append(request->ip_data_mbuf, padding,
797			    (void *)&zero);
798			if (ok != 1) {
799				ICL_WARN("failed to append padding");
800				return (1);
801			}
802		}
803
804		if (ic->ic_data_crc32c) {
805			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
806
807			ok = m_append(request->ip_data_mbuf, sizeof(digest),
808			    (void *)&digest);
809			if (ok != 1) {
810				ICL_WARN("failed to append header digest");
811				return (1);
812			}
813		}
814
815		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
816		request->ip_data_mbuf = NULL;
817	}
818
819	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
820
821	error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
822	    NULL, MSG_DONTWAIT, curthread);
823	request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
824	if (error != 0) {
825		ICL_DEBUG("sosend error %d", error);
826		return (error);
827	}
828
829	return (0);
830}
831
832static void
833icl_conn_send_pdus(struct icl_conn *ic)
834{
835	struct icl_pdu *request;
836	struct socket *so;
837	size_t available, size;
838	int error;
839
840	ICL_CONN_LOCK_ASSERT(ic);
841
842	so = ic->ic_socket;
843
844	SOCKBUF_LOCK(&so->so_snd);
845	available = sbspace(&so->so_snd);
846	SOCKBUF_UNLOCK(&so->so_snd);
847
848	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
849		if (ic->ic_disconnecting)
850			return;
851
852		request = TAILQ_FIRST(&ic->ic_to_send);
853		size = icl_pdu_size(request);
854		if (available < size) {
855			/*
856			 * Set the low watermark on the socket,
857			 * to avoid waking up until there is enough
858			 * space.
859			 */
860			SOCKBUF_LOCK(&so->so_snd);
861			so->so_snd.sb_lowat = size;
862			SOCKBUF_UNLOCK(&so->so_snd);
863#if 1
864			ICL_DEBUG("no space to send; "
865			    "have %zd, need %zd",
866			    available, size);
867#endif
868			return;
869		}
870		available -= size;
871		TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
872		error = icl_pdu_send(request);
873		if (error != 0) {
874			ICL_DEBUG("failed to send PDU; "
875			    "dropping connection");
876			icl_conn_fail(ic);
877			return;
878		}
879		icl_pdu_free(request);
880	}
881}
882
883static void
884icl_send_thread(void *arg)
885{
886	struct icl_conn *ic;
887
888	ic = arg;
889
890	ICL_CONN_LOCK(ic);
891	ic->ic_send_running = true;
892
893	for (;;) {
894		if (ic->ic_disconnecting) {
895			//ICL_DEBUG("terminating");
896			break;
897		}
898		icl_conn_send_pdus(ic);
899		cv_wait(&ic->ic_send_cv, &ic->ic_lock);
900	}
901
902	ic->ic_send_running = false;
903	ICL_CONN_UNLOCK(ic);
904	kthread_exit();
905}
906
907static int
908icl_soupcall_send(struct socket *so, void *arg, int waitflag)
909{
910	struct icl_conn *ic;
911
912	ic = arg;
913	cv_signal(&ic->ic_send_cv);
914	return (SU_OK);
915}
916
917int
918icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
919{
920	struct mbuf *mb, *newmb;
921	size_t copylen, off = 0;
922
923	KASSERT(len > 0, ("len == 0"));
924
925	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
926	if (newmb == NULL) {
927		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
928		return (ENOMEM);
929	}
930
931	for (mb = newmb; mb != NULL; mb = mb->m_next) {
932		copylen = min(M_TRAILINGSPACE(mb), len - off);
933		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
934		mb->m_len = copylen;
935		off += copylen;
936	}
937	KASSERT(off == len, ("%s: off != len", __func__));
938
939	if (request->ip_data_mbuf == NULL) {
940		request->ip_data_mbuf = newmb;
941		request->ip_data_len = len;
942	} else {
943		m_cat(request->ip_data_mbuf, newmb);
944		request->ip_data_len += len;
945	}
946
947	return (0);
948}
949
950void
951icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
952{
953
954	m_copydata(ip->ip_data_mbuf, off, len, addr);
955}
956
957void
958icl_pdu_queue(struct icl_pdu *ip)
959{
960	struct icl_conn *ic;
961
962	ic = ip->ip_conn;
963
964	ICL_CONN_LOCK(ic);
965	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
966		ICL_DEBUG("icl_pdu_queue on closed connection");
967		ICL_CONN_UNLOCK(ic);
968		icl_pdu_free(ip);
969		return;
970	}
971	TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
972	ICL_CONN_UNLOCK(ic);
973	cv_signal(&ic->ic_send_cv);
974}
975
976struct icl_conn *
977icl_conn_new(void)
978{
979	struct icl_conn *ic;
980
981	refcount_acquire(&icl_ncons);
982
983	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
984
985	TAILQ_INIT(&ic->ic_to_send);
986	mtx_init(&ic->ic_lock, "icl_lock", NULL, MTX_DEF);
987	cv_init(&ic->ic_send_cv, "icl_tx");
988	cv_init(&ic->ic_receive_cv, "icl_rx");
989#ifdef DIAGNOSTIC
990	refcount_init(&ic->ic_outstanding_pdus, 0);
991#endif
992	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
993
994	return (ic);
995}
996
997void
998icl_conn_free(struct icl_conn *ic)
999{
1000
1001	mtx_destroy(&ic->ic_lock);
1002	cv_destroy(&ic->ic_send_cv);
1003	cv_destroy(&ic->ic_receive_cv);
1004	uma_zfree(icl_conn_zone, ic);
1005	refcount_release(&icl_ncons);
1006}
1007
1008static int
1009icl_conn_start(struct icl_conn *ic)
1010{
1011	size_t bufsize;
1012	struct sockopt opt;
1013	int error, one = 1;
1014
1015	ICL_CONN_LOCK(ic);
1016
1017	/*
1018	 * XXX: Ugly hack.
1019	 */
1020	if (ic->ic_socket == NULL) {
1021		ICL_CONN_UNLOCK(ic);
1022		return (EINVAL);
1023	}
1024
1025	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1026	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1027	ic->ic_disconnecting = false;
1028
1029	ICL_CONN_UNLOCK(ic);
1030
1031	/*
1032	 * Use max available sockbuf size for sending.  Do it manually
1033	 * instead of sbreserve(9) to work around resource limits.
1034	 *
1035	 * XXX: This kind of sucks.  On one hand, we don't currently support
1036	 *	sending a part of data segment; we always do it in one piece,
1037	 *	so we have to make sure it can fit in the socket buffer.
1038	 *	Once I've implemented partial send, we'll get rid of this
1039	 *	and use autoscaling.
1040	 */
1041        bufsize = (sizeof(struct iscsi_bhs) +
1042            ic->ic_max_data_segment_length) * 8;
1043	error = soreserve(ic->ic_socket, bufsize, bufsize);
1044	if (error != 0) {
1045		ICL_WARN("soreserve failed with error %d", error);
1046		icl_conn_close(ic);
1047		return (error);
1048	}
1049
1050	/*
1051	 * Disable Nagle.
1052	 */
1053	bzero(&opt, sizeof(opt));
1054	opt.sopt_dir = SOPT_SET;
1055	opt.sopt_level = IPPROTO_TCP;
1056	opt.sopt_name = TCP_NODELAY;
1057	opt.sopt_val = &one;
1058	opt.sopt_valsize = sizeof(one);
1059	error = sosetopt(ic->ic_socket, &opt);
1060	if (error != 0) {
1061		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1062		icl_conn_close(ic);
1063		return (error);
1064	}
1065
1066	/*
1067	 * Start threads.
1068	 */
1069	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1070	if (error != 0) {
1071		ICL_WARN("kthread_add(9) failed with error %d", error);
1072		icl_conn_close(ic);
1073		return (error);
1074	}
1075
1076	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1077	if (error != 0) {
1078		ICL_WARN("kthread_add(9) failed with error %d", error);
1079		icl_conn_close(ic);
1080		return (error);
1081	}
1082
1083	/*
1084	 * Register socket upcall, to get notified about incoming PDUs
1085	 * and free space to send outgoing ones.
1086	 */
1087	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1088	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1089	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1090	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1091	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1092	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1093
1094	return (0);
1095}
1096
1097int
1098icl_conn_handoff(struct icl_conn *ic, int fd)
1099{
1100	struct file *fp;
1101	struct socket *so;
1102	cap_rights_t rights;
1103	int error;
1104
1105	/*
1106	 * Steal the socket from userland.
1107	 */
1108	error = fget(curthread, fd,
1109	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1110	if (error != 0)
1111		return (error);
1112	if (fp->f_type != DTYPE_SOCKET) {
1113		fdrop(fp, curthread);
1114		return (EINVAL);
1115	}
1116	so = fp->f_data;
1117	if (so->so_type != SOCK_STREAM) {
1118		fdrop(fp, curthread);
1119		return (EINVAL);
1120	}
1121
1122	ICL_CONN_LOCK(ic);
1123
1124	if (ic->ic_socket != NULL) {
1125		ICL_CONN_UNLOCK(ic);
1126		fdrop(fp, curthread);
1127		return (EBUSY);
1128	}
1129
1130	ic->ic_socket = fp->f_data;
1131	fp->f_ops = &badfileops;
1132	fp->f_data = NULL;
1133	fdrop(fp, curthread);
1134	ICL_CONN_UNLOCK(ic);
1135
1136	error = icl_conn_start(ic);
1137
1138	return (error);
1139}
1140
1141void
1142icl_conn_shutdown(struct icl_conn *ic)
1143{
1144
1145	ICL_CONN_LOCK(ic);
1146	if (ic->ic_socket == NULL) {
1147		ICL_CONN_UNLOCK(ic);
1148		return;
1149	}
1150	ICL_CONN_UNLOCK(ic);
1151
1152	soshutdown(ic->ic_socket, SHUT_RDWR);
1153}
1154
1155void
1156icl_conn_close(struct icl_conn *ic)
1157{
1158	struct icl_pdu *pdu;
1159
1160	ICL_CONN_LOCK(ic);
1161	if (ic->ic_socket == NULL) {
1162		ICL_CONN_UNLOCK(ic);
1163		return;
1164	}
1165
1166	ic->ic_disconnecting = true;
1167
1168	/*
1169	 * Wake up the threads, so they can properly terminate.
1170	 */
1171	cv_signal(&ic->ic_receive_cv);
1172	cv_signal(&ic->ic_send_cv);
1173	while (ic->ic_receive_running || ic->ic_send_running) {
1174		//ICL_DEBUG("waiting for send/receive threads to terminate");
1175		ICL_CONN_UNLOCK(ic);
1176		cv_signal(&ic->ic_receive_cv);
1177		cv_signal(&ic->ic_send_cv);
1178		pause("icl_close", 1 * hz);
1179		ICL_CONN_LOCK(ic);
1180	}
1181	//ICL_DEBUG("send/receive threads terminated");
1182
1183	soclose(ic->ic_socket);
1184	ic->ic_socket = NULL;
1185
1186	if (ic->ic_receive_pdu != NULL) {
1187		//ICL_DEBUG("freeing partially received PDU");
1188		icl_pdu_free(ic->ic_receive_pdu);
1189		ic->ic_receive_pdu = NULL;
1190	}
1191
1192	/*
1193	 * Remove any outstanding PDUs from the send queue.
1194	 */
1195	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1196		pdu = TAILQ_FIRST(&ic->ic_to_send);
1197		TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1198		icl_pdu_free(pdu);
1199	}
1200
1201	KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1202	    ("destroying session with non-empty send queue"));
1203	/*
1204	 * XXX
1205	 */
1206#if 0
1207	KASSERT(ic->ic_outstanding_pdus == 0,
1208	    ("destroying session with %d outstanding PDUs",
1209	     ic->ic_outstanding_pdus));
1210#endif
1211	ICL_CONN_UNLOCK(ic);
1212}
1213
1214bool
1215icl_conn_connected(struct icl_conn *ic)
1216{
1217
1218	ICL_CONN_LOCK(ic);
1219	if (ic->ic_socket == NULL) {
1220		ICL_CONN_UNLOCK(ic);
1221		return (false);
1222	}
1223	if (ic->ic_socket->so_error != 0) {
1224		ICL_CONN_UNLOCK(ic);
1225		return (false);
1226	}
1227	ICL_CONN_UNLOCK(ic);
1228	return (true);
1229}
1230
#ifdef ICL_KERNEL_PROXY
/*
 * Attach an in-kernel stream socket to the connection and start it.
 * Returns EINVAL for non-SOCK_STREAM sockets, EBUSY if the connection
 * already has a socket, or the result of icl_conn_start().
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1253
1254static int
1255icl_unload(void)
1256{
1257
1258	if (icl_ncons != 0)
1259		return (EBUSY);
1260
1261	uma_zdestroy(icl_conn_zone);
1262	uma_zdestroy(icl_pdu_zone);
1263
1264	return (0);
1265}
1266
1267static void
1268icl_load(void)
1269{
1270
1271	icl_conn_zone = uma_zcreate("icl_conn",
1272	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1273	    UMA_ALIGN_PTR, 0);
1274	icl_pdu_zone = uma_zcreate("icl_pdu",
1275	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1276	    UMA_ALIGN_PTR, 0);
1277
1278	refcount_init(&icl_ncons, 0);
1279}
1280
1281static int
1282icl_modevent(module_t mod, int what, void *arg)
1283{
1284
1285	switch (what) {
1286	case MOD_LOAD:
1287		icl_load();
1288		return (0);
1289	case MOD_UNLOAD:
1290		return (icl_unload());
1291	default:
1292		return (EINVAL);
1293	}
1294}
1295
1296moduledata_t icl_data = {
1297	"icl",
1298	icl_modevent,
1299	0
1300};
1301
1302DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1303MODULE_VERSION(icl, 1);
1304