
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

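/*
 * Assign a local address and port to the socket, creating the underlying
 * RDMA CM id on first use.  rdma_bind_addr() performs the actual port
 * selection and conflict detection, so the ssk lock is dropped around it.
 */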
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

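/*
 * Free the sdp_sock.  The pcb must already be detached from its socket and
 * is removed from the global list here, after which its RDMA resources
 * (QP, rings and CM id) are torn down.
 */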
static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return (struct sockaddr *)sin;
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

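/*
 * Apply the given notify function to every connection with the specified
 * foreign address, e.g. to report an ICMP error to all sockets talking to
 * a dead host.
 */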
static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

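/*
 * Initialize the timers, rings and other per-connection state that does
 * not depend on the peer.  Called once from sdp_attach() when the socket
 * is created.
 */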
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the connection is
 * synchronized, then send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

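/*
 * Queue a FIN for transmission and arm the disconnect timeout.  If the
 * peer never acknowledges it, sdp_dreq_timeout() moves the connection on
 * to the 2msl wait.
 */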
static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_closed() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

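/*
 * Enqueue an mbuf chain, which already has an SDP header prepended, on the
 * send buffer.  Each record in the buffer corresponds to one SDP packet,
 * so small writes are coalesced into the last record while it is still
 * writable and the combined chain fits within the transmit goal.
 */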
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
		("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (SDP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

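/*
 * Record the arrival of urgent data.  The last byte of the incoming chain
 * is stashed as the out-of-band byte unless SO_OOBINLINE is set, mirroring
 * TCP's handling of the urgent pointer.
 */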
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

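/*
 * Protocol control input, invoked for ICMP errors against one of our
 * foreign addresses.  Translate the command to an errno and notify all
 * matching connections.
 */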
static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}

static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}

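/*
 * Periodic keepalive timer; posts an SDP keepalive message and reschedules
 * itself while SO_KEEPALIVE remains set and the connection has not been
 * dropped.  Runs with the ssk lock held by the callout subsystem.
 */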
static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the ssk lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define SDP_WLOCK_RECHECK(inp) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while(0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

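/*
 * Apply the global interrupt moderation settings (sdp_mod_count events per
 * sdp_mod_usec microseconds) to a connection's receive completion queue.
 */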
void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

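/*
 * Per-device initialization for the IB client: allocate a protection
 * domain and an FMR pool for the new device and hang them off the client
 * data.  On failure the device is simply left without SDP support.
 */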
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

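/*
 * Device removal: reset every connection using the departing device, then
 * release the protection domain and FMR pool that were allocated in
 * sdp_dev_add().
 */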
static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

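/*
 * Export the list of SDP connections to userland in the same xtcpcb format
 * used for TCP, so existing tools can display them.
 */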
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
		+ n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this entry. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.xt_tp.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_socket);
		xt.xt_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

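/*
 * Domain initialization: create the pcb zone, register for maxsockets
 * changes, start the receive completion workqueue, and register with the
 * IB core so that sdp_dev_add() runs for each RDMA device.
 */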
static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
		EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;