icl_soft.c revision 274033
1/*-
2 * Copyright (c) 2012 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 */
30
31/*
32 * iSCSI Common Layer.  It's used by both the initiator and target to send
33 * and receive iSCSI PDUs.
34 */
35
36#include <sys/cdefs.h>
37__FBSDID("$FreeBSD: head/sys/dev/iscsi/icl.c 274033 2014-11-03 11:05:23Z trasz $");
38
39#include <sys/param.h>
40#include <sys/capsicum.h>
41#include <sys/condvar.h>
42#include <sys/conf.h>
43#include <sys/file.h>
44#include <sys/kernel.h>
45#include <sys/kthread.h>
46#include <sys/lock.h>
47#include <sys/mbuf.h>
48#include <sys/mutex.h>
49#include <sys/module.h>
50#include <sys/protosw.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/sysctl.h>
54#include <sys/systm.h>
55#include <sys/sx.h>
56#include <sys/uio.h>
57#include <vm/uma.h>
58#include <netinet/in.h>
59#include <netinet/tcp.h>
60
61#include <dev/iscsi/icl.h>
62#include <dev/iscsi/iscsi_proto.h>
63
/*
 * Tunables exported under the kern.icl sysctl node.
 */
SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
/* Verbosity: ICL_WARN() prints when > 0, ICL_DEBUG() prints when > 1. */
static int debug = 1;
SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RWTUN,
    &debug, 0, "Enable debug messages");
/* When non-zero, icl_conn_send_pdus() merges queued PDUs into one sosend(). */
static int coalesce = 1;
SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &coalesce, 0, "Try to coalesce PDUs before sending");
/* Upper bound used by icl_pdu_data_segment_receive_len() for partial reads. */
static int partial_receive_len = 128 * 1024;
SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
    &partial_receive_len, 0, "Minimum read size for partially received "
    "data segment");
/*
 * Socket buffer sizes passed to soreserve() by icl_conn_start(), which also
 * raises them when they are below the minimum it computes.
 */
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
    &sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
    &recvspace, 0, "Default receive socket buffer size");

/* UMA zones backing struct icl_conn and struct icl_pdu allocations. */
static uma_zone_t icl_conn_zone;
static uma_zone_t icl_pdu_zone;

/* Acquired in icl_conn_new() and released in icl_conn_free(). */
static volatile u_int	icl_ncons;
86
/*
 * Debug message, prefixed with the calling function's name; printed only
 * when kern.icl.debug is greater than 1.
 */
#define	ICL_DEBUG(X, ...)						\
	do {								\
		if (debug > 1)						\
			printf("%s: " X "\n", __func__, ## __VA_ARGS__);\
	} while (0)

/*
 * Warning message, prefixed with "WARNING: " and the calling function's
 * name; printed when kern.icl.debug is greater than 0.
 */
#define	ICL_WARN(X, ...)						\
	do {								\
		if (debug > 0) {					\
			printf("WARNING: %s: " X "\n",			\
			    __func__, ## __VA_ARGS__);			\
		}							\
	} while (0)

/*
 * Wrappers around the connection mutex (ic_lock).  The argument is
 * parenthesized so that any pointer expression can be passed safely.
 */
#define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)

/* Queue head type used for lists of PDUs waiting to be sent. */
STAILQ_HEAD(icl_pdu_stailq, icl_pdu);
107
108static void
109icl_conn_fail(struct icl_conn *ic)
110{
111	if (ic->ic_socket == NULL)
112		return;
113
114	/*
115	 * XXX
116	 */
117	ic->ic_socket->so_error = EDOOFUS;
118	(ic->ic_error)(ic);
119}
120
121static struct mbuf *
122icl_conn_receive(struct icl_conn *ic, size_t len)
123{
124	struct uio uio;
125	struct socket *so;
126	struct mbuf *m;
127	int error, flags;
128
129	so = ic->ic_socket;
130
131	memset(&uio, 0, sizeof(uio));
132	uio.uio_resid = len;
133
134	flags = MSG_DONTWAIT;
135	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
136	if (error != 0) {
137		ICL_DEBUG("soreceive error %d", error);
138		return (NULL);
139	}
140	if (uio.uio_resid != 0) {
141		m_freem(m);
142		ICL_DEBUG("short read");
143		return (NULL);
144	}
145
146	return (m);
147}
148
149static struct icl_pdu *
150icl_pdu_new_empty(struct icl_conn *ic, int flags)
151{
152	struct icl_pdu *ip;
153
154#ifdef DIAGNOSTIC
155	refcount_acquire(&ic->ic_outstanding_pdus);
156#endif
157	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
158	if (ip == NULL) {
159		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
160#ifdef DIAGNOSTIC
161		refcount_release(&ic->ic_outstanding_pdus);
162#endif
163		return (NULL);
164	}
165
166	ip->ip_conn = ic;
167
168	return (ip);
169}
170
171void
172icl_pdu_free(struct icl_pdu *ip)
173{
174	struct icl_conn *ic;
175
176	ic = ip->ip_conn;
177
178	m_freem(ip->ip_bhs_mbuf);
179	m_freem(ip->ip_ahs_mbuf);
180	m_freem(ip->ip_data_mbuf);
181	uma_zfree(icl_pdu_zone, ip);
182#ifdef DIAGNOSTIC
183	refcount_release(&ic->ic_outstanding_pdus);
184#endif
185}
186
187/*
188 * Allocate icl_pdu with empty BHS to fill up by the caller.
189 */
190struct icl_pdu *
191icl_pdu_new_bhs(struct icl_conn *ic, int flags)
192{
193	struct icl_pdu *ip;
194
195	ip = icl_pdu_new_empty(ic, flags);
196	if (ip == NULL)
197		return (NULL);
198
199	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
200	    flags, MT_DATA, M_PKTHDR);
201	if (ip->ip_bhs_mbuf == NULL) {
202		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
203		icl_pdu_free(ip);
204		return (NULL);
205	}
206	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
207	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
208	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
209
210	return (ip);
211}
212
213static int
214icl_pdu_ahs_length(const struct icl_pdu *request)
215{
216
217	return (request->ip_bhs->bhs_total_ahs_len * 4);
218}
219
220size_t
221icl_pdu_data_segment_length(const struct icl_pdu *request)
222{
223	uint32_t len = 0;
224
225	len += request->ip_bhs->bhs_data_segment_len[0];
226	len <<= 8;
227	len += request->ip_bhs->bhs_data_segment_len[1];
228	len <<= 8;
229	len += request->ip_bhs->bhs_data_segment_len[2];
230
231	return (len);
232}
233
234static void
235icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
236{
237
238	response->ip_bhs->bhs_data_segment_len[2] = len;
239	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
240	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
241}
242
243static size_t
244icl_pdu_padding(const struct icl_pdu *ip)
245{
246
247	if ((ip->ip_data_len % 4) != 0)
248		return (4 - (ip->ip_data_len % 4));
249
250	return (0);
251}
252
253static size_t
254icl_pdu_size(const struct icl_pdu *response)
255{
256	size_t len;
257
258	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
259
260	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
261	    icl_pdu_padding(response);
262	if (response->ip_conn->ic_header_crc32c)
263		len += ISCSI_HEADER_DIGEST_SIZE;
264	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
265		len += ISCSI_DATA_DIGEST_SIZE;
266
267	return (len);
268}
269
270static int
271icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
272{
273	struct mbuf *m;
274
275	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
276	if (m == NULL) {
277		ICL_DEBUG("failed to receive BHS");
278		return (-1);
279	}
280
281	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
282	if (request->ip_bhs_mbuf == NULL) {
283		ICL_WARN("m_pullup failed");
284		return (-1);
285	}
286	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
287
288	/*
289	 * XXX: For architectures with strict alignment requirements
290	 * 	we may need to allocate ip_bhs and copy the data into it.
291	 * 	For some reason, though, not doing this doesn't seem
292	 * 	to cause problems; tested on sparc64.
293	 */
294
295	*availablep -= sizeof(struct iscsi_bhs);
296	return (0);
297}
298
299static int
300icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
301{
302
303	request->ip_ahs_len = icl_pdu_ahs_length(request);
304	if (request->ip_ahs_len == 0)
305		return (0);
306
307	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
308	    request->ip_ahs_len);
309	if (request->ip_ahs_mbuf == NULL) {
310		ICL_DEBUG("failed to receive AHS");
311		return (-1);
312	}
313
314	*availablep -= request->ip_ahs_len;
315	return (0);
316}
317
318static uint32_t
319icl_mbuf_to_crc32c(const struct mbuf *m0)
320{
321	uint32_t digest = 0xffffffff;
322	const struct mbuf *m;
323
324	for (m = m0; m != NULL; m = m->m_next)
325		digest = calculate_crc32c(digest,
326		    mtod(m, const void *), m->m_len);
327
328	digest = digest ^ 0xffffffff;
329
330	return (digest);
331}
332
333static int
334icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
335{
336	struct mbuf *m;
337	uint32_t received_digest, valid_digest;
338
339	if (request->ip_conn->ic_header_crc32c == false)
340		return (0);
341
342	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
343	if (m == NULL) {
344		ICL_DEBUG("failed to receive header digest");
345		return (-1);
346	}
347
348	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
349	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
350	m_freem(m);
351
352	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
353
354	/*
355	 * XXX: Handle AHS.
356	 */
357	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
358	if (received_digest != valid_digest) {
359		ICL_WARN("header digest check failed; got 0x%x, "
360		    "should be 0x%x", received_digest, valid_digest);
361		return (-1);
362	}
363
364	return (0);
365}
366
367/*
368 * Return the number of bytes that should be waiting in the receive socket
369 * before icl_pdu_receive_data_segment() gets called.
370 */
371static size_t
372icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
373{
374	size_t len;
375
376	len = icl_pdu_data_segment_length(request);
377	if (len == 0)
378		return (0);
379
380	/*
381	 * Account for the parts of data segment already read from
382	 * the socket buffer.
383	 */
384	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
385	len -= request->ip_data_len;
386
387	/*
388	 * Don't always wait for the full data segment to be delivered
389	 * to the socket; this might badly affect performance due to
390	 * TCP window scaling.
391	 */
392	if (len > partial_receive_len) {
393#if 0
394		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
395		    len, partial_receive_len));
396#endif
397		len = partial_receive_len;
398
399		return (len);
400	}
401
402	/*
403	 * Account for padding.  Note that due to the way code is written,
404	 * the icl_pdu_receive_data_segment() must always receive padding
405	 * along with the last part of data segment, because it would be
406	 * impossible to tell whether we've already received the full data
407	 * segment including padding, or without it.
408	 */
409	if ((len % 4) != 0)
410		len += 4 - (len % 4);
411
412#if 0
413	ICL_DEBUG("need %zd bytes of data", len));
414#endif
415
416	return (len);
417}
418
419static int
420icl_pdu_receive_data_segment(struct icl_pdu *request,
421    size_t *availablep, bool *more_neededp)
422{
423	struct icl_conn *ic;
424	size_t len, padding = 0;
425	struct mbuf *m;
426
427	ic = request->ip_conn;
428
429	*more_neededp = false;
430	ic->ic_receive_len = 0;
431
432	len = icl_pdu_data_segment_length(request);
433	if (len == 0)
434		return (0);
435
436	if ((len % 4) != 0)
437		padding = 4 - (len % 4);
438
439	/*
440	 * Account for already received parts of data segment.
441	 */
442	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
443	len -= request->ip_data_len;
444
445	if (len + padding > *availablep) {
446		/*
447		 * Not enough data in the socket buffer.  Receive as much
448		 * as we can.  Don't receive padding, since, obviously, it's
449		 * not the end of data segment yet.
450		 */
451#if 0
452		ICL_DEBUG("limited from %zd to %zd",
453		    len + padding, *availablep - padding));
454#endif
455		len = *availablep - padding;
456		*more_neededp = true;
457		padding = 0;
458	}
459
460	/*
461	 * Must not try to receive padding without at least one byte
462	 * of actual data segment.
463	 */
464	if (len > 0) {
465		m = icl_conn_receive(request->ip_conn, len + padding);
466		if (m == NULL) {
467			ICL_DEBUG("failed to receive data segment");
468			return (-1);
469		}
470
471		if (request->ip_data_mbuf == NULL)
472			request->ip_data_mbuf = m;
473		else
474			m_cat(request->ip_data_mbuf, m);
475
476		request->ip_data_len += len;
477		*availablep -= len + padding;
478	} else
479		ICL_DEBUG("len 0");
480
481	if (*more_neededp)
482		ic->ic_receive_len =
483		    icl_pdu_data_segment_receive_len(request);
484
485	return (0);
486}
487
488static int
489icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
490{
491	struct mbuf *m;
492	uint32_t received_digest, valid_digest;
493
494	if (request->ip_conn->ic_data_crc32c == false)
495		return (0);
496
497	if (request->ip_data_len == 0)
498		return (0);
499
500	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
501	if (m == NULL) {
502		ICL_DEBUG("failed to receive data digest");
503		return (-1);
504	}
505
506	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
507	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
508	m_freem(m);
509
510	*availablep -= ISCSI_DATA_DIGEST_SIZE;
511
512	/*
513	 * Note that ip_data_mbuf also contains padding; since digest
514	 * calculation is supposed to include that, we iterate over
515	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
516	 */
517	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
518	if (received_digest != valid_digest) {
519		ICL_WARN("data digest check failed; got 0x%x, "
520		    "should be 0x%x", received_digest, valid_digest);
521		return (-1);
522	}
523
524	return (0);
525}
526
527/*
528 * Somewhat contrary to the name, this attempts to receive only one
529 * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
530 */
531static struct icl_pdu *
532icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
533{
534	struct icl_pdu *request;
535	struct socket *so;
536	size_t len;
537	int error;
538	bool more_needed;
539
540	so = ic->ic_socket;
541
542	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
543		KASSERT(ic->ic_receive_pdu == NULL,
544		    ("ic->ic_receive_pdu != NULL"));
545		request = icl_pdu_new_empty(ic, M_NOWAIT);
546		if (request == NULL) {
547			ICL_DEBUG("failed to allocate PDU; "
548			    "dropping connection");
549			icl_conn_fail(ic);
550			return (NULL);
551		}
552		ic->ic_receive_pdu = request;
553	} else {
554		KASSERT(ic->ic_receive_pdu != NULL,
555		    ("ic->ic_receive_pdu == NULL"));
556		request = ic->ic_receive_pdu;
557	}
558
559	if (*availablep < ic->ic_receive_len) {
560#if 0
561		ICL_DEBUG("not enough data; need %zd, "
562		    "have %zd", ic->ic_receive_len, *availablep);
563#endif
564		return (NULL);
565	}
566
567	switch (ic->ic_receive_state) {
568	case ICL_CONN_STATE_BHS:
569		//ICL_DEBUG("receiving BHS");
570		error = icl_pdu_receive_bhs(request, availablep);
571		if (error != 0) {
572			ICL_DEBUG("failed to receive BHS; "
573			    "dropping connection");
574			break;
575		}
576
577		/*
578		 * We don't enforce any limit for AHS length;
579		 * its length is stored in 8 bit field.
580		 */
581
582		len = icl_pdu_data_segment_length(request);
583		if (len > ic->ic_max_data_segment_length) {
584			ICL_WARN("received data segment "
585			    "length %zd is larger than negotiated "
586			    "MaxDataSegmentLength %zd; "
587			    "dropping connection",
588			    len, ic->ic_max_data_segment_length);
589			error = EINVAL;
590			break;
591		}
592
593		ic->ic_receive_state = ICL_CONN_STATE_AHS;
594		ic->ic_receive_len = icl_pdu_ahs_length(request);
595		break;
596
597	case ICL_CONN_STATE_AHS:
598		//ICL_DEBUG("receiving AHS");
599		error = icl_pdu_receive_ahs(request, availablep);
600		if (error != 0) {
601			ICL_DEBUG("failed to receive AHS; "
602			    "dropping connection");
603			break;
604		}
605		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
606		if (ic->ic_header_crc32c == false)
607			ic->ic_receive_len = 0;
608		else
609			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
610		break;
611
612	case ICL_CONN_STATE_HEADER_DIGEST:
613		//ICL_DEBUG("receiving header digest");
614		error = icl_pdu_check_header_digest(request, availablep);
615		if (error != 0) {
616			ICL_DEBUG("header digest failed; "
617			    "dropping connection");
618			break;
619		}
620
621		ic->ic_receive_state = ICL_CONN_STATE_DATA;
622		ic->ic_receive_len =
623		    icl_pdu_data_segment_receive_len(request);
624		break;
625
626	case ICL_CONN_STATE_DATA:
627		//ICL_DEBUG("receiving data segment");
628		error = icl_pdu_receive_data_segment(request, availablep,
629		    &more_needed);
630		if (error != 0) {
631			ICL_DEBUG("failed to receive data segment;"
632			    "dropping connection");
633			break;
634		}
635
636		if (more_needed)
637			break;
638
639		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
640		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
641			ic->ic_receive_len = 0;
642		else
643			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
644		break;
645
646	case ICL_CONN_STATE_DATA_DIGEST:
647		//ICL_DEBUG("receiving data digest");
648		error = icl_pdu_check_data_digest(request, availablep);
649		if (error != 0) {
650			ICL_DEBUG("data digest failed; "
651			    "dropping connection");
652			break;
653		}
654
655		/*
656		 * We've received complete PDU; reset the receive state machine
657		 * and return the PDU.
658		 */
659		ic->ic_receive_state = ICL_CONN_STATE_BHS;
660		ic->ic_receive_len = sizeof(struct iscsi_bhs);
661		ic->ic_receive_pdu = NULL;
662		return (request);
663
664	default:
665		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
666	}
667
668	if (error != 0) {
669		/*
670		 * Don't free the PDU; it's pointed to by ic->ic_receive_pdu
671		 * and will get freed in icl_conn_close().
672		 */
673		icl_conn_fail(ic);
674	}
675
676	return (NULL);
677}
678
679static void
680icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
681{
682	struct icl_pdu *response;
683	struct socket *so;
684
685	so = ic->ic_socket;
686
687	/*
688	 * This can never happen; we're careful to only mess with ic->ic_socket
689	 * pointer when the send/receive threads are not running.
690	 */
691	KASSERT(so != NULL, ("NULL socket"));
692
693	for (;;) {
694		if (ic->ic_disconnecting)
695			return;
696
697		if (so->so_error != 0) {
698			ICL_DEBUG("connection error %d; "
699			    "dropping connection", so->so_error);
700			icl_conn_fail(ic);
701			return;
702		}
703
704		/*
705		 * Loop until we have a complete PDU or there is not enough
706		 * data in the socket buffer.
707		 */
708		if (available < ic->ic_receive_len) {
709#if 0
710			ICL_DEBUG("not enough data; have %zd, "
711			    "need %zd", available,
712			    ic->ic_receive_len);
713#endif
714			return;
715		}
716
717		response = icl_conn_receive_pdu(ic, &available);
718		if (response == NULL)
719			continue;
720
721		if (response->ip_ahs_len > 0) {
722			ICL_WARN("received PDU with unsupported "
723			    "AHS; opcode 0x%x; dropping connection",
724			    response->ip_bhs->bhs_opcode);
725			icl_pdu_free(response);
726			icl_conn_fail(ic);
727			return;
728		}
729
730		(ic->ic_receive)(response);
731	}
732}
733
/*
 * Body of the per-connection receive kthread; "arg" is the icl_conn.
 *
 * Marks the connection as having a running receiver, then loops until
 * ic_disconnecting is set: it samples the amount of buffered data under
 * the receive sockbuf lock, sleeps on ic_receive_cv when that is less
 * than ic_receive_len, and then passes the sampled amount to
 * icl_conn_receive_pdus().  On exit it clears ic_receive_running and
 * signals ic_send_cv.
 */
static void
icl_receive_thread(void *arg)
{
	struct icl_conn *ic;
	size_t available;
	struct socket *so;

	ic = arg;
	so = ic->ic_socket;

	ICL_CONN_LOCK(ic);
	ic->ic_receive_running = true;
	ICL_CONN_UNLOCK(ic);

	for (;;) {
		if (ic->ic_disconnecting) {
			//ICL_DEBUG("terminating");
			break;
		}

		/*
		 * Set the low watermark, to be checked by
		 * soreadable() in icl_soupcall_receive()
		 * to avoid unnecessary wakeups until there
		 * is enough data received to read the PDU.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		available = so->so_rcv.sb_cc;
		if (available < ic->ic_receive_len) {
			so->so_rcv.sb_lowat = ic->ic_receive_len;
			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
		} else
			so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
		SOCKBUF_UNLOCK(&so->so_rcv);

		/*
		 * Note that "available" was sampled before any cv_wait()
		 * above and is not refreshed after the wakeup.
		 */
		icl_conn_receive_pdus(ic, available);
	}

	ICL_CONN_LOCK(ic);
	ic->ic_receive_running = false;
	cv_signal(&ic->ic_send_cv);
	ICL_CONN_UNLOCK(ic);
	kthread_exit();
}
778
779static int
780icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
781{
782	struct icl_conn *ic;
783
784	if (!soreadable(so))
785		return (SU_OK);
786
787	ic = arg;
788	cv_signal(&ic->ic_receive_cv);
789	return (SU_OK);
790}
791
792static int
793icl_pdu_finalize(struct icl_pdu *request)
794{
795	size_t padding, pdu_len;
796	uint32_t digest, zero = 0;
797	int ok;
798	struct icl_conn *ic;
799
800	ic = request->ip_conn;
801
802	icl_pdu_set_data_segment_length(request, request->ip_data_len);
803
804	pdu_len = icl_pdu_size(request);
805
806	if (ic->ic_header_crc32c) {
807		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
808		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
809		    (void *)&digest);
810		if (ok != 1) {
811			ICL_WARN("failed to append header digest");
812			return (1);
813		}
814	}
815
816	if (request->ip_data_len != 0) {
817		padding = icl_pdu_padding(request);
818		if (padding > 0) {
819			ok = m_append(request->ip_data_mbuf, padding,
820			    (void *)&zero);
821			if (ok != 1) {
822				ICL_WARN("failed to append padding");
823				return (1);
824			}
825		}
826
827		if (ic->ic_data_crc32c) {
828			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
829
830			ok = m_append(request->ip_data_mbuf, sizeof(digest),
831			    (void *)&digest);
832			if (ok != 1) {
833				ICL_WARN("failed to append data digest");
834				return (1);
835			}
836		}
837
838		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
839		request->ip_data_mbuf = NULL;
840	}
841
842	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
843
844	return (0);
845}
846
847static void
848icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
849{
850	struct icl_pdu *request, *request2;
851	struct socket *so;
852	size_t available, size, size2;
853	int coalesced, error;
854
855	ICL_CONN_LOCK_ASSERT_NOT(ic);
856
857	so = ic->ic_socket;
858
859	SOCKBUF_LOCK(&so->so_snd);
860	/*
861	 * Check how much space do we have for transmit.  We can't just
862	 * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE,
863	 * as it always frees the mbuf chain passed to it, even in case
864	 * of error.
865	 */
866	available = sbspace(&so->so_snd);
867
868	/*
869	 * Notify the socket upcall that we don't need wakeups
870	 * for the time being.
871	 */
872	so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1;
873	SOCKBUF_UNLOCK(&so->so_snd);
874
875	while (!STAILQ_EMPTY(queue)) {
876		request = STAILQ_FIRST(queue);
877		size = icl_pdu_size(request);
878		if (available < size) {
879
880			/*
881			 * Set the low watermark, to be checked by
882			 * sowriteable() in icl_soupcall_send()
883			 * to avoid unneccessary wakeups until there
884			 * is enough space for the PDU to fit.
885			 */
886			SOCKBUF_LOCK(&so->so_snd);
887			available = sbspace(&so->so_snd);
888			if (available < size) {
889#if 1
890				ICL_DEBUG("no space to send; "
891				    "have %zd, need %zd",
892				    available, size);
893#endif
894				so->so_snd.sb_lowat = size;
895				SOCKBUF_UNLOCK(&so->so_snd);
896				return;
897			}
898			SOCKBUF_UNLOCK(&so->so_snd);
899		}
900		STAILQ_REMOVE_HEAD(queue, ip_next);
901		error = icl_pdu_finalize(request);
902		if (error != 0) {
903			ICL_DEBUG("failed to finalize PDU; "
904			    "dropping connection");
905			icl_conn_fail(ic);
906			icl_pdu_free(request);
907			return;
908		}
909		if (coalesce) {
910			coalesced = 1;
911			for (;;) {
912				request2 = STAILQ_FIRST(queue);
913				if (request2 == NULL)
914					break;
915				size2 = icl_pdu_size(request2);
916				if (available < size + size2)
917					break;
918				STAILQ_REMOVE_HEAD(queue, ip_next);
919				error = icl_pdu_finalize(request2);
920				if (error != 0) {
921					ICL_DEBUG("failed to finalize PDU; "
922					    "dropping connection");
923					icl_conn_fail(ic);
924					icl_pdu_free(request);
925					icl_pdu_free(request2);
926					return;
927				}
928				m_cat(request->ip_bhs_mbuf, request2->ip_bhs_mbuf);
929				request2->ip_bhs_mbuf = NULL;
930				request->ip_bhs_mbuf->m_pkthdr.len += size2;
931				size += size2;
932				STAILQ_REMOVE_AFTER(queue, request, ip_next);
933				icl_pdu_free(request2);
934				coalesced++;
935			}
936#if 0
937			if (coalesced > 1) {
938				ICL_DEBUG("coalesced %d PDUs into %zd bytes",
939				    coalesced, size);
940			}
941#endif
942		}
943		available -= size;
944		error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
945		    NULL, MSG_DONTWAIT, curthread);
946		request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
947		if (error != 0) {
948			ICL_DEBUG("failed to send PDU, error %d; "
949			    "dropping connection", error);
950			icl_conn_fail(ic);
951			icl_pdu_free(request);
952			return;
953		}
954		icl_pdu_free(request);
955	}
956}
957
/*
 * Body of the per-connection send kthread; "arg" is the icl_conn.
 *
 * Holds the connection lock except while icl_conn_send_pdus() runs.
 * PDUs are moved from ic_to_send into a thread-local queue and sent
 * from there; the thread sleeps on ic_send_cv when there is nothing
 * more it can do.  When ic_disconnecting is set, unsent PDUs are moved
 * back to ic_to_send, ic_send_running is cleared and ic_send_cv is
 * signalled before the thread exits.
 */
static void
icl_send_thread(void *arg)
{
	struct icl_conn *ic;
	struct icl_pdu_stailq queue;

	ic = arg;

	STAILQ_INIT(&queue);

	ICL_CONN_LOCK(ic);
	ic->ic_send_running = true;

	for (;;) {
		for (;;) {
			/*
			 * If the local queue is empty, populate it from
			 * the main one.  This way the icl_conn_send_pdus()
			 * can go through all the queued PDUs without holding
			 * any locks.
			 */
			if (STAILQ_EMPTY(&queue))
				STAILQ_SWAP(&ic->ic_to_send, &queue, icl_pdu);

			/*
			 * Cleared before unlocking; icl_soupcall_send()
			 * sets it again if it runs in the meantime.
			 */
			ic->ic_check_send_space = false;
			ICL_CONN_UNLOCK(ic);
			icl_conn_send_pdus(ic, &queue);
			ICL_CONN_LOCK(ic);

			/*
			 * The icl_soupcall_send() was called since the last
			 * call to sbspace(); go around;
			 */
			if (ic->ic_check_send_space)
				continue;

			/*
			 * Local queue is empty, but we still have PDUs
			 * in the main one; go around.
			 */
			if (STAILQ_EMPTY(&queue) &&
			    !STAILQ_EMPTY(&ic->ic_to_send))
				continue;

			/*
			 * There might be some stuff in the local queue,
			 * which didn't get sent due to not having enough send
			 * space.  Wait for socket upcall.
			 */
			break;
		}

		if (ic->ic_disconnecting) {
			//ICL_DEBUG("terminating");
			break;
		}

		cv_wait(&ic->ic_send_cv, ic->ic_lock);
	}

	/*
	 * We're exiting; move PDUs back to the main queue, so they can
	 * get freed properly.  At this point ordering doesn't matter.
	 */
	STAILQ_CONCAT(&ic->ic_to_send, &queue);

	ic->ic_send_running = false;
	cv_signal(&ic->ic_send_cv);
	ICL_CONN_UNLOCK(ic);
	kthread_exit();
}
1029
1030static int
1031icl_soupcall_send(struct socket *so, void *arg, int waitflag)
1032{
1033	struct icl_conn *ic;
1034
1035	if (!sowriteable(so))
1036		return (SU_OK);
1037
1038	ic = arg;
1039
1040	ICL_CONN_LOCK(ic);
1041	ic->ic_check_send_space = true;
1042	ICL_CONN_UNLOCK(ic);
1043
1044	cv_signal(&ic->ic_send_cv);
1045
1046	return (SU_OK);
1047}
1048
1049int
1050icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len,
1051    int flags)
1052{
1053	struct mbuf *mb, *newmb;
1054	size_t copylen, off = 0;
1055
1056	KASSERT(len > 0, ("len == 0"));
1057
1058	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
1059	if (newmb == NULL) {
1060		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
1061		return (ENOMEM);
1062	}
1063
1064	for (mb = newmb; mb != NULL; mb = mb->m_next) {
1065		copylen = min(M_TRAILINGSPACE(mb), len - off);
1066		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
1067		mb->m_len = copylen;
1068		off += copylen;
1069	}
1070	KASSERT(off == len, ("%s: off != len", __func__));
1071
1072	if (request->ip_data_mbuf == NULL) {
1073		request->ip_data_mbuf = newmb;
1074		request->ip_data_len = len;
1075	} else {
1076		m_cat(request->ip_data_mbuf, newmb);
1077		request->ip_data_len += len;
1078	}
1079
1080	return (0);
1081}
1082
/*
 * Copy "len" bytes, starting "off" bytes into the PDU's data mbuf chain,
 * into the buffer at "addr".
 */
void
icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
{

	m_copydata(ip->ip_data_mbuf, off, len, addr);
}
1089
1090void
1091icl_pdu_queue(struct icl_pdu *ip)
1092{
1093	struct icl_conn *ic;
1094
1095	ic = ip->ip_conn;
1096
1097	ICL_CONN_LOCK_ASSERT(ic);
1098
1099	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
1100		ICL_DEBUG("icl_pdu_queue on closed connection");
1101		icl_pdu_free(ip);
1102		return;
1103	}
1104
1105	if (!STAILQ_EMPTY(&ic->ic_to_send)) {
1106		STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1107		/*
1108		 * If the queue is not empty, someone else had already
1109		 * signaled the send thread; no need to do that again,
1110		 * just return.
1111		 */
1112		return;
1113	}
1114
1115	STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1116	cv_signal(&ic->ic_send_cv);
1117}
1118
1119struct icl_conn *
1120icl_conn_new(const char *name, struct mtx *lock)
1121{
1122	struct icl_conn *ic;
1123
1124	refcount_acquire(&icl_ncons);
1125
1126	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
1127
1128	STAILQ_INIT(&ic->ic_to_send);
1129	ic->ic_lock = lock;
1130	cv_init(&ic->ic_send_cv, "icl_tx");
1131	cv_init(&ic->ic_receive_cv, "icl_rx");
1132#ifdef DIAGNOSTIC
1133	refcount_init(&ic->ic_outstanding_pdus, 0);
1134#endif
1135	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
1136	ic->ic_name = name;
1137
1138	return (ic);
1139}
1140
1141void
1142icl_conn_free(struct icl_conn *ic)
1143{
1144
1145	cv_destroy(&ic->ic_send_cv);
1146	cv_destroy(&ic->ic_receive_cv);
1147	uma_zfree(icl_conn_zone, ic);
1148	refcount_release(&icl_ncons);
1149}
1150
1151static int
1152icl_conn_start(struct icl_conn *ic)
1153{
1154	size_t minspace;
1155	struct sockopt opt;
1156	int error, one = 1;
1157
1158	ICL_CONN_LOCK(ic);
1159
1160	/*
1161	 * XXX: Ugly hack.
1162	 */
1163	if (ic->ic_socket == NULL) {
1164		ICL_CONN_UNLOCK(ic);
1165		return (EINVAL);
1166	}
1167
1168	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1169	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1170	ic->ic_disconnecting = false;
1171
1172	ICL_CONN_UNLOCK(ic);
1173
1174	/*
1175	 * For sendspace, this is required because the current code cannot
1176	 * send a PDU in pieces; thus, the minimum buffer size is equal
1177	 * to the maximum PDU size.  "+4" is to account for possible padding.
1178	 *
1179	 * What we should actually do here is to use autoscaling, but set
1180	 * some minimal buffer size to "minspace".  I don't know a way to do
1181	 * that, though.
1182	 */
1183	minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
1184	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
1185	if (sendspace < minspace) {
1186		ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
1187		    minspace);
1188		sendspace = minspace;
1189	}
1190	if (recvspace < minspace) {
1191		ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
1192		    minspace);
1193		recvspace = minspace;
1194	}
1195
1196	error = soreserve(ic->ic_socket, sendspace, recvspace);
1197	if (error != 0) {
1198		ICL_WARN("soreserve failed with error %d", error);
1199		icl_conn_close(ic);
1200		return (error);
1201	}
1202
1203	/*
1204	 * Disable Nagle.
1205	 */
1206	bzero(&opt, sizeof(opt));
1207	opt.sopt_dir = SOPT_SET;
1208	opt.sopt_level = IPPROTO_TCP;
1209	opt.sopt_name = TCP_NODELAY;
1210	opt.sopt_val = &one;
1211	opt.sopt_valsize = sizeof(one);
1212	error = sosetopt(ic->ic_socket, &opt);
1213	if (error != 0) {
1214		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1215		icl_conn_close(ic);
1216		return (error);
1217	}
1218
1219	/*
1220	 * Start threads.
1221	 */
1222	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
1223	    ic->ic_name);
1224	if (error != 0) {
1225		ICL_WARN("kthread_add(9) failed with error %d", error);
1226		icl_conn_close(ic);
1227		return (error);
1228	}
1229
1230	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
1231	    ic->ic_name);
1232	if (error != 0) {
1233		ICL_WARN("kthread_add(9) failed with error %d", error);
1234		icl_conn_close(ic);
1235		return (error);
1236	}
1237
1238	/*
1239	 * Register socket upcall, to get notified about incoming PDUs
1240	 * and free space to send outgoing ones.
1241	 */
1242	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1243	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1244	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1245	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1246	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1247	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1248
1249	return (0);
1250}
1251
1252int
1253icl_conn_handoff(struct icl_conn *ic, int fd)
1254{
1255	struct file *fp;
1256	struct socket *so;
1257	cap_rights_t rights;
1258	int error;
1259
1260	ICL_CONN_LOCK_ASSERT_NOT(ic);
1261
1262	/*
1263	 * Steal the socket from userland.
1264	 */
1265	error = fget(curthread, fd,
1266	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1267	if (error != 0)
1268		return (error);
1269	if (fp->f_type != DTYPE_SOCKET) {
1270		fdrop(fp, curthread);
1271		return (EINVAL);
1272	}
1273	so = fp->f_data;
1274	if (so->so_type != SOCK_STREAM) {
1275		fdrop(fp, curthread);
1276		return (EINVAL);
1277	}
1278
1279	ICL_CONN_LOCK(ic);
1280
1281	if (ic->ic_socket != NULL) {
1282		ICL_CONN_UNLOCK(ic);
1283		fdrop(fp, curthread);
1284		return (EBUSY);
1285	}
1286
1287	ic->ic_socket = fp->f_data;
1288	fp->f_ops = &badfileops;
1289	fp->f_data = NULL;
1290	fdrop(fp, curthread);
1291	ICL_CONN_UNLOCK(ic);
1292
1293	error = icl_conn_start(ic);
1294
1295	return (error);
1296}
1297
1298void
1299icl_conn_close(struct icl_conn *ic)
1300{
1301	struct icl_pdu *pdu;
1302
1303	ICL_CONN_LOCK_ASSERT_NOT(ic);
1304
1305	ICL_CONN_LOCK(ic);
1306	if (ic->ic_socket == NULL) {
1307		ICL_CONN_UNLOCK(ic);
1308		return;
1309	}
1310
1311	/*
1312	 * Deregister socket upcalls.
1313	 */
1314	ICL_CONN_UNLOCK(ic);
1315	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1316	if (ic->ic_socket->so_snd.sb_upcall != NULL)
1317		soupcall_clear(ic->ic_socket, SO_SND);
1318	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1319	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1320	if (ic->ic_socket->so_rcv.sb_upcall != NULL)
1321		soupcall_clear(ic->ic_socket, SO_RCV);
1322	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1323	ICL_CONN_LOCK(ic);
1324
1325	ic->ic_disconnecting = true;
1326
1327	/*
1328	 * Wake up the threads, so they can properly terminate.
1329	 */
1330	while (ic->ic_receive_running || ic->ic_send_running) {
1331		//ICL_DEBUG("waiting for send/receive threads to terminate");
1332		cv_signal(&ic->ic_receive_cv);
1333		cv_signal(&ic->ic_send_cv);
1334		cv_wait(&ic->ic_send_cv, ic->ic_lock);
1335	}
1336	//ICL_DEBUG("send/receive threads terminated");
1337
1338	ICL_CONN_UNLOCK(ic);
1339	soclose(ic->ic_socket);
1340	ICL_CONN_LOCK(ic);
1341	ic->ic_socket = NULL;
1342
1343	if (ic->ic_receive_pdu != NULL) {
1344		//ICL_DEBUG("freeing partially received PDU");
1345		icl_pdu_free(ic->ic_receive_pdu);
1346		ic->ic_receive_pdu = NULL;
1347	}
1348
1349	/*
1350	 * Remove any outstanding PDUs from the send queue.
1351	 */
1352	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
1353		pdu = STAILQ_FIRST(&ic->ic_to_send);
1354		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
1355		icl_pdu_free(pdu);
1356	}
1357
1358	KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
1359	    ("destroying session with non-empty send queue"));
1360#ifdef DIAGNOSTIC
1361	KASSERT(ic->ic_outstanding_pdus == 0,
1362	    ("destroying session with %d outstanding PDUs",
1363	     ic->ic_outstanding_pdus));
1364#endif
1365	ICL_CONN_UNLOCK(ic);
1366}
1367
1368bool
1369icl_conn_connected(struct icl_conn *ic)
1370{
1371	ICL_CONN_LOCK_ASSERT_NOT(ic);
1372
1373	ICL_CONN_LOCK(ic);
1374	if (ic->ic_socket == NULL) {
1375		ICL_CONN_UNLOCK(ic);
1376		return (false);
1377	}
1378	if (ic->ic_socket->so_error != 0) {
1379		ICL_CONN_UNLOCK(ic);
1380		return (false);
1381	}
1382	ICL_CONN_UNLOCK(ic);
1383	return (true);
1384}
1385
#ifdef ICL_KERNEL_PROXY
/*
 * Attach the socket "so" to the connection and start it.
 *
 * Returns EINVAL when the socket is not of type SOCK_STREAM, EBUSY when the
 * connection already has a socket, and otherwise the result of
 * icl_conn_start().  Must be called without the connection lock held.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	/* Claim the socket slot only if it is still empty. */
	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1410
1411static int
1412icl_unload(void)
1413{
1414
1415	if (icl_ncons != 0)
1416		return (EBUSY);
1417
1418	uma_zdestroy(icl_conn_zone);
1419	uma_zdestroy(icl_pdu_zone);
1420
1421	return (0);
1422}
1423
1424static void
1425icl_load(void)
1426{
1427
1428	icl_conn_zone = uma_zcreate("icl_conn",
1429	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1430	    UMA_ALIGN_PTR, 0);
1431	icl_pdu_zone = uma_zcreate("icl_pdu",
1432	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1433	    UMA_ALIGN_PTR, 0);
1434
1435	refcount_init(&icl_ncons, 0);
1436}
1437
1438static int
1439icl_modevent(module_t mod, int what, void *arg)
1440{
1441
1442	switch (what) {
1443	case MOD_LOAD:
1444		icl_load();
1445		return (0);
1446	case MOD_UNLOAD:
1447		return (icl_unload());
1448	default:
1449		return (EINVAL);
1450	}
1451}
1452
1453moduledata_t icl_data = {
1454	"icl",
1455	icl_modevent,
1456	0
1457};
1458
1459DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1460MODULE_VERSION(icl, 1);
1461