/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
 * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
 */

/*
 *
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

uma_zone_t	sdp_zone;
struct rwlock	sdp_lock;
LIST_HEAD(, sdp_sock) sdp_list;

struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);
/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

static int sdp_count;
/*
 * Disable asynchronous CMA events for sockets that are being torn down.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

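/*
 * Bind the pcb to a local address via the CMA.  The pcb lock is
 * dropped around rdma_bind_addr(), which resolves port collisions for
 * us; on success the chosen local address and port are copied back
 * into the pcb.
 */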
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk,
		    RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

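/*
 * Tear down a pcb: mark it destroyed, unlink it from the global list,
 * and release its QP, rings, CMA id and, finally, the pcb itself.
 * Called with the pcb write lock held and no socket attached.
 */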
static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return ((struct sockaddr *)sin);
}

static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return (0);
}

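/*
 * Run the notify function on every pcb connected to the given foreign
 * address, e.g. to propagate a control-input error.  The callback
 * returns NULL if it consumed the pcb lock, non-NULL if the caller
 * should release it.
 */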
static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

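/*
 * Reset the connection: disable the QP, issue rdma_disconnect() with
 * the pcb lock dropped, and mark the pcb CLOSED.
 */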
static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close an SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer-based shutdowns which cannot operate in
 * callout context.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * The 2MSL timer has expired; schedule the shutdown task.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the DREQ wait timeout and enter
 * the 2MSL wait.
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{

	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

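/*
 * Initialize the per-connection state: the 2MSL/keepalive callout,
 * the shutdown task, and the receive and transmit rings.
 */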
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return (0);
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate an SDP connection to nam.
 */
static int
sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_in src;
	struct socket *so;
	int error;

	so = ssk->socket;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->lport == 0) {
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return (error);
	}
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	bzero(&src.sin_zero, sizeof(src.sin_zero));
	src.sin_port = ssk->lport;
	src.sin_addr.s_addr = ssk->laddr;
	soisconnecting(so);
	SDP_WUNLOCK(ssk);
	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
	    SDP_RESOLVE_TIMEOUT);
	SDP_WLOCK(ssk);
	if (error == 0)
		ssk->state = TCPS_SYN_SENT;

	return (error);
}

/*
 * Initiate an SDP connection.
 */
static int
sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof(*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);
	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
		return (error);
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
		error = EINVAL;
	else
		error = sdp_start_connect(ssk, nam, td);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Drop an SDP socket, reporting the specified error.  If the
 * connection is synchronized, send a RST to the peer.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_closed() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User-initiated disconnect.
 */
static int
sdp_disconnect(struct socket *so)
{
	struct sdp_sock *ssk;
	int error = 0;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	sdp_start_disconnect(ssk);
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 *
 * XXX This is broken XXX
 *
 * The rationale for acquiring the sdp lock here is somewhat complicated,
 * and is described in detail in the commit log entry for r175612.  Acquiring
 * it delays an accept(2) racing with sonewconn(), which inserts the socket
 * before the address/port fields are initialized.  A better fix would
 * prevent the socket from being placed in the listen queue until all fields
 * are fully initialized.
 */
static int
sdp_accept(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk = NULL;
	struct in_addr addr;
	in_port_t port;
	int error;

	if (so->so_state & SS_ISDISCONNECTED)
		return (ECONNABORTED);

	port = 0;
	addr.s_addr = 0;
	error = 0;
	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNABORTED;
		goto out;
	}
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		*nam = sdp_sockaddr(port, &addr);
	return (error);
}

/*
 * Mark the connection as being incapable of further output.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

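/*
 * Append an mbuf chain, which must carry a packet header, to the send
 * buffer.  When the last queued SDP packet is still writable and the
 * combined chain fits within the scatter/gather and size limits, the
 * new chain is coalesced into it; otherwise it is queued as a new
 * record.
 */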
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in the output queue and updating the urgent
 * marker if URG is set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile; normal sends come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages. */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int)min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (SDP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp & ~MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			    m != NULL && m->m_len <= len;
			    m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		SOCKBUF_UNLOCK(sb);
		SDP_WLOCK(ssk);
		sdp_do_posts(ssk);
		SDP_WUNLOCK(ssk);
		SOCKBUF_LOCK(sb);
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

/*
 * Abort is used to tear down a connection, typically while it sits in
 * the accept queue.
 */
void
sdp_abort(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_drop(ssk, ECONNABORTED);
	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
	    ssk, ssk->flags));
	SDP_WUNLOCK(ssk);
}

/*
 * Close an SDP socket and initiate a friendly disconnect.
 */
static void
sdp_close(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	/*
	 * If we have not yet dropped, do it now.
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	    (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

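/*
 * Record the arrival of urgent data: set the out-of-band mark and,
 * unless SO_OOBINLINE is set, pull the urgent byte out of the mbuf
 * chain and stash it in the pcb for sdp_rcvoob() to return.
 */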
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify an SDP socket of an asynchronous error.
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
struct sdp_sock *
sdp_notify(struct sdp_sock *ssk, int error)
{

	SDP_WLOCK_ASSERT(ssk);

	if ((ssk->flags & SDP_TIMEWAIT) ||
	    (ssk->flags & SDP_DROPPED))
		return (ssk);

	/*
	 * Ignore some errors if we are hooked up.
	 */
	if (ssk->state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN))
		return (ssk);
	ssk->softerror = error;
	return (sdp_drop(ssk, error));
}

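/*
 * Control input: map a protocol control command (e.g. an ICMP
 * unreachable) to an errno via inetctlerrmap and notify every pcb
 * connected to the affected foreign address.
 */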
static void
sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
	struct in_addr faddr;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
}

static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{

	return (EOPNOTSUPP);
}

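/*
 * Keepalive timer: post an SDP keepalive message and re-arm the
 * callout, unless the connection has been dropped or SO_KEEPALIVE
 * was cleared.
 */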
static void
sdp_keepalive_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled as a different kind of timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}

void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the pcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	SDP_WLOCK_RECHECK(ssk) do {					\
	SDP_WLOCK(ssk);							\
	if ((ssk)->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
		SDP_WUNLOCK(ssk);					\
		return (ECONNRESET);					\
	}								\
} while (0)

static int
sdp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int	error, opt, optval;
	struct sdp_sock *ssk;

	error = 0;
	ssk = sdp_sk(so);
	if (sopt->sopt_level == SOL_SOCKET &&
	    sopt->sopt_name == SO_KEEPALIVE) {
		SDP_WLOCK(ssk);
		if (so->so_options & SO_KEEPALIVE)
			sdp_start_keepalive_timer(so);
		else
			sdp_stop_keepalive_timer(so);
		SDP_WUNLOCK(ssk);
	}
	if (sopt->sopt_level != IPPROTO_TCP)
		return (error);

	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			SDP_WUNLOCK(ssk);
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error)
				return (error);

			SDP_WLOCK_RECHECK(ssk);
			opt = SDP_NODELAY;
			if (optval)
				ssk->flags |= opt;
			else
				ssk->flags &= ~opt;
			sdp_do_posts(ssk);
			SDP_WUNLOCK(ssk);
			break;

		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;

	case SOPT_GET:
		switch (sopt->sopt_name) {
		case TCP_NODELAY:
			optval = ssk->flags & SDP_NODELAY;
			SDP_WUNLOCK(ssk);
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			SDP_WUNLOCK(ssk);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef SDP_WLOCK_RECHECK

int sdp_mod_count = 0;
int sdp_mod_usec = 0;

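/*
 * Apply the administratively configured interrupt moderation (event
 * count and delay in microseconds) to the receive completion queue.
 */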
void
sdp_set_default_moderation(struct sdp_sock *ssk)
{

	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
		return;
	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
}

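/*
 * IB client add callback: allocate a protection domain and an FMR
 * pool for a newly attached device and store them in the device's
 * client data.
 */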
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof(param));
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

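/*
 * IB client removal callback: reset every connection that is using
 * the departing device, then release the FMR pool and PD allocated
 * in sdp_dev_add().
 */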
static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };

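/*
 * Export the list of SDP connections to userland in struct xtcpcb
 * format, mirroring tcp_pcblist(), so that netstat and friends can
 * display them.
 */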
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * sizeof(xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) +
	    n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof(xig);
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof(xig));
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp. */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			error = 0;
			goto next;
		}

		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof(xt);
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof(xt));
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof(xig));
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

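/*
 * Domain initialization: create the pcb zone, track maxsockets
 * changes, create the receive completion workqueue, and register
 * with the IB core.
 */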
static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

int sdp_debug_level = 1;
int sdp_data_debug_level = 0;