/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
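
/*
 * Lock order: the global list lock is taken before any per-socket lock
 * (see sdp_dev_rem()); sdp_pcbfree() therefore drops the socket lock
 * before acquiring SDP_LIST_WLOCK() to unlink the pcb.
 */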

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;

/*
 * Disable asynchronous CMA events for sockets which are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

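/*
 * Bind a local address and port via the CMA.  The socket lock is
 * dropped across rdma_create_id() and rdma_bind_addr(), both of which
 * may sleep; rdma_bind_addr() itself resolves any bind races.
 */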
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

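/*
 * Free an SDP pcb after its socket has been detached.  SDP_DESTROY is
 * set, with the socket lock briefly dropped, so that the timer and
 * shutdown paths can detect a pcb that is going away.
 */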
static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr *sa)
{
	struct sdp_sock *ssk = sdp_sk(so);

	SDP_RLOCK(ssk);
	*(struct sockaddr_in *)sa = (struct sockaddr_in){
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_addr.s_addr = ssk->laddr,
		.sin_port = ssk->lport,
	};
	SDP_RUNLOCK(ssk);

	return (0);
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr *sa)
{
	struct sdp_sock *ssk = sdp_sk(so);

	SDP_RLOCK(ssk);
	*(struct sockaddr_in *)sa = (struct sockaddr_in){
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_addr.s_addr = ssk->faddr,
		.sin_port = ssk->fport,
	};
	SDP_RUNLOCK(ssk);

	return (0);
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

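/*
 * Abortively close the connection: clear qp_active and issue
 * rdma_disconnect() with the socket lock dropped, since the CMA call
 * may block, then mark the connection closed.
 */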
static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		sorele(so);
		return (NULL);
	}
	return (ssk);
}

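/*
 * Overview of the teardown machinery below: sdp_output_disconnect()
 * sets SDP_DREQWAIT and arms the keep2msl callout as a disconnect
 * request timer.  When the peer's final fin/ack arrives,
 * sdp_cancel_dreq_wait_timeout() enters TIMEWAIT via sdp_2msl_wait();
 * if it never arrives, sdp_dreq_timeout() does the same.  Once the
 * 2MSL period expires, sdp_2msl_timeout() queues sdp_shutdown_task(),
 * which completes the close and may free the pcb.
 */
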
/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return (0);
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}
/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the
 * connection is synchronized, then send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_closed() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr *sa)
{
	struct sdp_sock *ssk = NULL;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = ECONNABORTED;
	else
		*(struct sockaddr_in *)sa = (struct sockaddr_in){
			.sin_family = AF_INET,
			.sin_len = sizeof(struct sockaddr_in),
			.sin_addr.s_addr = ssk->faddr,
			.sin_port = ssk->fport,
		};
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so, enum shutdown_how how)
{
	struct sdp_sock *ssk = sdp_sk(so);
	int error = 0;

	SOCK_LOCK(so);
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	if (SOLISTENING(so)) {
		if (how != SHUT_WR) {
			so->so_error = ECONNABORTED;
			solisten_wakeup(so);	/* unlocks so */
		} else
			SOCK_UNLOCK(so);
		return (0);
	}
	SOCK_UNLOCK(so);

	switch (how) {
	case SHUT_RD:
		socantrcvmore(so);
		sbrelease(so, SO_RCV);
		break;
	case SHUT_RDWR:
		socantrcvmore(so);
		sbrelease(so, SO_RCV);
		/* FALLTHROUGH */
	case SHUT_WR:
		SDP_WLOCK(ssk);
		if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
			SDP_WUNLOCK(ssk);
			error = ECONNRESET;
			break;
		}
		socantsendmore(so);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED))
			sdp_output_disconnect(ssk);
		SDP_WUNLOCK(ssk);
	}
	wakeup(&so->so_timeo);

	return (error);
}

static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
		("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in the output queue and updating the urgent
 * marker if URG is set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile; normal sends arrive via sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	if (nam != NULL) {
		if (nam->sa_family != AF_INET) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EAFNOSUPPORT);
		}
		if (nam->sa_len != sizeof(struct sockaddr_in)) {
			if (control)
				m_freem(control);
			m_freem(m);
			return (EINVAL);
		}
	}

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(so, SO_RCV);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while it sits in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped, let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		ssk->flags |= SDP_SOCKREF;
		soref(so);
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

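/*
 * Note an incoming urgent byte: record the out-of-band mark and, unless
 * SO_OOBINLINE is set, pull the final octet of the chain into iobc where
 * sdp_rcvoob() can find it.
 */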
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		return;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the SDP socket lock before performing copyin
 * on socket option arguments.  When it re-acquires the lock after the
 * copy, it has to revalidate that the connection is still valid for the
 * socket option.
 */
#define SDP_WLOCK_RECHECK(ssk) do {					\
	SDP_WLOCK(ssk);							\
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while(0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

void
sdp_set_default_moderation(struct sdp_sock *ssk)
{
	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

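/*
 * Device added: allocate the protection domain and FMR pool used by all
 * SDP sockets on this device.  On failure no client data is registered
 * and the device is simply unusable for SDP.
 */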
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
		+ n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE,
    0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

static void
sdp_init(void *arg __unused)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
		EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}
SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL);

#define	SDP_PROTOSW							\
	.pr_type =		SOCK_STREAM,				\
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,\
	.pr_ctloutput =		sdp_ctloutput,				\
	.pr_abort =		sdp_abort,				\
	.pr_accept =		sdp_accept,				\
	.pr_attach =		sdp_attach,				\
	.pr_bind =		sdp_bind,				\
	.pr_connect =		sdp_connect,				\
	.pr_detach =		sdp_detach,				\
	.pr_disconnect =	sdp_disconnect,				\
	.pr_listen =		sdp_listen,				\
	.pr_peeraddr =		sdp_getpeeraddr,			\
	.pr_rcvoob =		sdp_rcvoob,				\
	.pr_send =		sdp_send,				\
	.pr_sosend =		sdp_sosend,				\
	.pr_soreceive =		sdp_sorecv,				\
	.pr_shutdown =		sdp_shutdown,				\
	.pr_sockaddr =		sdp_getsockaddr,			\
	.pr_close =		sdp_close

static struct protosw sdp_ip_protosw = {
	.pr_protocol =		IPPROTO_IP,
	SDP_PROTOSW
};
static struct protosw sdp_tcp_protosw = {
	.pr_protocol =		IPPROTO_TCP,
	SDP_PROTOSW
};

static struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_nprotosw =		2,
	.dom_protosw = {
		&sdp_ip_protosw,
		&sdp_tcp_protosw,
	},
};

DOMAIN_SET(sdp);
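
/*
 * Userland selects SDP by creating its socket in this domain; a minimal
 * sketch, assuming the AF_INET_SDP constant from <sys/socket.h>:
 *
 *	int s = socket(AF_INET_SDP, SOCK_STREAM, 0);
 *
 * after which the usual bind/connect/listen calls apply.
 */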

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;