/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

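/*
 * sdp_lock protects the global list of SDP sockets (sdp_list) and the
 * sdp_count statistic; per-socket state is protected by each ssk's own
 * rwlock.
 */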
RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable asynchronous CMA events for sockets that are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

static void
sdp_pcbfree(struct sdp_sock *ssk)
{
	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	sdp_destroy_cma(ssk);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return ((struct sockaddr *)sin);
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
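		/*
		 * The notify routine returns the ssk with its lock still
		 * held when the caller should unlock it, or NULL when it
		 * has already disposed of the socket and the lock.
		 */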
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns that cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * The 2MSL wait has expired; schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
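	/*
	 * The keep2msl callout was initialized with CALLOUT_RETURNUNLOCKED,
	 * so this handler is entered with the SDP lock held and must
	 * release it before returning.
	 */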
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the DREQ wait timeout and enter
 * the 2msl wait instead.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return (0);
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
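	/*
	 * While a timewait or disconnect-request wait is pending the pcb
	 * must outlive the socket; otherwise it can be freed as soon as
	 * it has been dropped or was never connected.
	 */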
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate an SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the connection
 * is synchronized, send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * The user issued a close, and we wish to trail through shutdown states:
 * if we never received a SYN, just forget it.  If we got a SYN from the
 * peer but haven't sent a FIN, go to FIN_WAIT_1 state to send the peer a
 * FIN.  If we already got a FIN from the peer, then we're almost done; go
 * to LAST_ACK state.  In all other cases we have already sent a FIN to
 * the peer (e.g. after PRU_SHUTDOWN), and just have to play the tedious
 * game of waiting for the peer to send a FIN or stop responding to
 * keep-alives, etc.  We can let the user exit from the close as soon as
 * the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If in an embryonic state, just send a reset (once).
 * If in the ``let data drain'' state and linger is null, just drop.
 * Otherwise (hard), mark the socket disconnecting and drop the
 * current input data; switch states based on user close, and
 * send a segment (with FIN) to the peer.
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_closed() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = so->so_rcv.sb_cc;
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
		("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in the output queue and updating the urgent
 * marker if URG is set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile; normal sends come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
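	/*
	 * Reserve room for the SDP header (struct sdp_bsdh) and mark
	 * this message as carrying plain data.
	 */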
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAIT);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAIT, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
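		/* Out-of-band sends may overcommit the buffer slightly. */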
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAIT, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int)min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp & ~MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sb->sb_cc > 0)
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sb->sb_cc == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sb->sb_cc > 0)
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sb->sb_cc > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sb->sb_cc >= sb->sb_lowat ||
	     sb->sb_cc >= uio->uio_resid ||
	     sb->sb_cc >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sb->sb_cc);
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while sitting in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = so->so_rcv.sb_cc + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
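	/*
	 * When out-of-band data is not delivered inline, stash the final
	 * octet of the urgent segment so it can be returned later by
	 * sdp_rcvoob(), and trim it from the mbuf chain.
	 */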
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up the user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}

static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}

static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the SDP socket lock before performing copyin
 * on socket option arguments.  When it re-acquires the lock after the
 * copy, it has to revalidate that the connection is still valid for the
 * socket option.
 */
#define	SDP_WLOCK_RECHECK(ssk) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while(0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(sdp_dev->mr))
		goto out_mr;
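	/*
	 * Create a cached pool of fast memory registrations (FMRs) with
	 * local-write and remote-read access for the data path to draw
	 * from.
	 */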
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dereg_mr(sdp_dev->mr);
out_mr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dereg_mr(sdp_dev->mr);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.xt_tp.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_socket);
		else
			bzero(&xt.xt_socket, sizeof xt.xt_socket);
		xt.xt_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;
1963